// -----// IR Dump Before AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
  func.func public @main(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
    %c2 = arith.constant 2 : index
    %c11 = arith.constant 11 : index
    %c13 = arith.constant 13 : index
    %c6 = arith.constant 6 : index
    %cst = arith.constant 0.000000e+00 : f32
    %0 = flow.dispatch.workgroups(%arg0, %arg1, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %2 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %3 = tensor.empty() : tensor<2x4x7x9xf32>
      %cst_0 = arith.constant 0.000000e+00 : f32
      %cast = tensor.cast %1 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %4 = linalg.fill ins(%cst_0 : f32) outs(%3 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %5 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %2 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%4 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %5, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    return %0 : tensor<2x4x7x9xf32>
  }
}
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before IREEImportPublicPass (iree-import-public) //----- //
// (module unchanged; identical to the previous dump)
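// A sketch for orientation, not part of the dump. A trace of this shape is what the standard
// MLIR instrumentation flags (-mlir-print-ir-before-all / -mlir-print-ir-after-all) produce,
// and the module enters the pipeline with the convolution already wrapped in a
// flow.dispatch.workgroups region. The standalone function below restates just the dispatched
// computation; the function and value names are invented and do not appear in the trace.
func.func @conv_body(%input: tensor<?x?x?x?xf32>, %filter: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
  %zero = arith.constant 0.000000e+00 : f32
  // The dynamically shaped input is asserted to its static shape before the convolution.
  %cast = tensor.cast %input : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
  %empty = tensor.empty() : tensor<2x4x7x9xf32>
  %acc = linalg.fill ins(%zero : f32) outs(%empty : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  // NCHW input 2x6x11x13 convolved with FCHW filter 4x6x5x5 at unit stride and unit
  // dilation: output H = 11 - 5 + 1 = 7, W = 13 - 5 + 1 = 9, giving 2x4x7x9.
  %conv = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
      ins(%cast, %filter : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>)
      outs(%acc : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  return %conv : tensor<2x4x7x9xf32>
}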
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
module {
  util.func public @main(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
    %c2 = arith.constant 2 : index
    %c11 = arith.constant 11 : index
    %c13 = arith.constant 13 : index
    %c6 = arith.constant 6 : index
    %cst = arith.constant 0.000000e+00 : f32
    %0 = flow.dispatch.workgroups(%arg0, %arg1, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %2 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %3 = tensor.empty() : tensor<2x4x7x9xf32>
      %cst_0 = arith.constant 0.000000e+00 : f32
      %cast = tensor.cast %1 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %4 = linalg.fill ins(%cst_0 : f32) outs(%3 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %5 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %2 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%4 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %5, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    util.return %0 : tensor<2x4x7x9xf32>
  }
}
// -----// IR Dump Before ImportMLProgramPass (iree-import-ml-program) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
module {
  util.func public @main(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
    %c2 = arith.constant 2 : index
    %c11 = arith.constant 11 : index
    %c13 = arith.constant 13 : index
    %c6 = arith.constant 6 : index
    %0 = flow.dispatch.workgroups(%arg0, %arg1, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %2 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %3 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %1 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %5 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %2 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%4 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %5, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    util.return %0 : tensor<2x4x7x9xf32>
  }
}
// -----// IR Dump Before DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
module {
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = util.call @_main(%4, %5) : (tensor<?x?x?x?xf32>, tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
  util.func private @_main(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
    %c2 = arith.constant 2 : index
    %c11 = arith.constant 11 : index
    %c13 = arith.constant 13 : index
    %c6 = arith.constant 6 : index
    %0 = flow.dispatch.workgroups(%arg0, %arg1, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %2 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %3 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %1 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %5 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %2 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%4 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %5, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    util.return %0 : tensor<2x4x7x9xf32>
  }
}
// -----// IR Dump Before Inliner (inline) //----- //
// (module unchanged; identical to the previous dump)
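// A sketch for orientation, not part of the dump: the marshaling pattern that
// iree-abi-wrap-entry-points emitted above, reduced to a single 1-D operand. A buffer view
// carries its shape at runtime, so the wrapper queries the dynamic dims, imports the operand
// as a tensor, calls the original entry point, and exports the result. All names here
// (@wrapped, @inner, %view0) are invented for the sketch.
module {
  util.func public @wrapped(%view0: !hal.buffer_view) -> !hal.buffer_view {
    %d0 = hal.buffer_view.dim<%view0 : !hal.buffer_view>[0] : index
    %t0 = hal.tensor.import %view0 "input0" : !hal.buffer_view -> tensor<?xf32>{%d0}
    %r = util.call @inner(%t0) : (tensor<?xf32>) -> tensor<4xf32>
    %out = hal.tensor.export %r "output0" : tensor<4xf32> -> !hal.buffer_view
    util.return %out : !hal.buffer_view
  }
  // Stand-in for the wrapped private function; asserts the dynamic shape and returns.
  util.func private @inner(%t: tensor<?xf32>) -> tensor<4xf32> {
    %cast = tensor.cast %t : tensor<?xf32> to tensor<4xf32>
    util.return %cast : tensor<4xf32>
  }
}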
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (function-scoped dump of @_main; identical to @_main in the "Before Inliner" module above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (function-scoped dump of the public @main wrapper; identical to @main in the "Before Inliner" module above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %c2 = arith.constant 2 : index
  %c11 = arith.constant 11 : index
  %c13 = arith.constant 13 : index
  %c6 = arith.constant 6 : index
  %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %cst = arith.constant 0.000000e+00 : f32
    %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
    %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %10 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %8 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %cst = arith.constant 0.000000e+00 : f32
    %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
    %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %10 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %8 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
module {
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %10 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %8 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (function-scoped dump of @main; identical to @main in the previous dump)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump Before CSE (cse) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
// (module unchanged; identical to the "After Inliner" dump above)
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
// (module unchanged; SymbolDCE found nothing to remove, since the inliner already erased @_main)
// -----// IR Dump Before AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
module attributes {hal.device.targets = [#hal.device.alias<"llvm-cpu"> : !hal.device]} {
  // (@main is unchanged; the pass only added the hal.device.targets module attribute above)
}
// -----// IR Dump Before MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
// (module identical to the previous dump)
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #hal.device.alias<"llvm-cpu"> : !hal.device
  // (util.func @main unchanged)
}
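// NOTE: MaterializeTargetDevicesPass turns the hal.device.targets attribute into
// util.global device handles and points stream.affinity.default at the first one.
// A hypothetical two-device module would materialize, e.g.:
//   util.global private @__device_0 = #hal.device.alias<"llvm-cpu"> : !hal.device
//   util.global private @__device_1 = #hal.device.alias<"vulkan"> : !hal.device
// (the @__device_1 line is illustrative only; this module has a single device).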
// -----// IR Dump Before ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
// (module identical to the previous dump)
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
// (module unchanged; nothing to resolve)
// -----// IR Dump Before ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
// (module identical to the previous dump)
hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} 
// -----// IR Dump Before VerifyDevicesPass (iree-hal-verify-devices) //----- //
// (module identical to the previous dump)
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
// (module unchanged; the pass only checks that every referenced device target is registered)
// -----// IR Dump Before LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
// (util.func @main as above; this and the following global-opt passes run
// per-function and print at function scope, so the module attributes and globals
// are not repeated)
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
// (function unchanged; the linalg.conv_2d_nchw_fchw here is float, not quantized)
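// NOTE: LinalgQuantizedConvToConvPass rewrites quantized convolutions (e.g. the
// upstream linalg.conv_2d_nhwc_hwcf_q op, which carries input/filter zero points)
// into a plain conv plus correction terms. A rough sketch, not from this module:
//   %q = linalg.conv_2d_nhwc_hwcf_q ins(%in, %f, %izp, %fzp : ...) outs(%acc : ...)
//   ==> %c = linalg.conv_2d_nhwc_hwcf ins(%in, %f : ...) outs(%acc : ...)
//       ... minus %fzp * windowed_sum(%in) and %izp * sum(%f),
//       plus window_size * %izp * %fzp
// Nothing matches here since the conv is f32.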
// -----// IR Dump Before LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
// (function identical to the previous dump)
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
// (function unchanged; no linalg.quantized_matmul in the module)
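// NOTE: LinalgQuantizedMatmulToMatmulPass does the same for linalg.quantized_matmul:
// with A: MxK, B: KxN and zero points az/bz,
//   sum_k (A[i,k] - az)(B[k,j] - bz)
//     = matmul(A,B)[i,j] - az * sum_k B[k,j] - bz * sum_k A[i,k] + K * az * bz
// so the pass emits linalg.matmul plus row/column-sum corrections. No quantized
// matmuls exist in this module, so the IR is unchanged.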
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function identical to the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function unchanged)
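// NOTE: iree-flow-canonicalize found nothing to fold: the constants are already
// deduplicated and the tensor.cast legitimately refines tensor<?x?x?x?xf32> to the
// static tensor<2x6x11x13xf32>, so it cannot be erased.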
// -----// IR Dump Before RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
// (function identical to the previous dump)
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
// (function unchanged; no zero-extent tensor values to replace)
// -----// IR Dump Before DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
// (function identical to the previous dump)
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
// (function unchanged; the conv already accumulates into a zero fill)
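// NOTE: DetachElementwiseFromNamedOpsPass rewrites named ops that accumulate into
// a non-constant outs operand so the init becomes a zero fill and the original
// accumulator is re-added elementwise afterwards. A rough sketch (hypothetical
// %bias; not from this module):
//   %r = linalg.conv_2d_nchw_fchw ... outs(%bias : tensor<2x4x7x9xf32>)
//   ==> %fill = linalg.fill ins(%zero : f32) outs(%empty : tensor<2x4x7x9xf32>)
//       %c    = linalg.conv_2d_nchw_fchw ... outs(%fill : tensor<2x4x7x9xf32>)
//       %r    = elementwise add of %c and %bias (as a linalg op)
// The conv in this module already accumulates into a linalg.fill of 0.0, so it is
// left alone.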
// -----// IR Dump Before LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
// (function identical to the previous dump)
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
// (function unchanged; this upstream pass mainly simplifies depthwise-conv
// variants, none of which appear here)
// -----// IR Dump Before Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
// (function identical to the previous dump)
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
// (function unchanged; the filter is 5x5, not 1x1)
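// NOTE: Convert1X1FilterConv2DToMatmulPass only fires for 1x1 filters. For a
// hypothetical tensor<4x6x1x1xf32> filter the conv is a matmul over the flattened
// spatial dims, roughly:
//   %in2d = tensor.collapse_shape %input [[0], [1], [2, 3]]
//           : tensor<2x6x11x13xf32> into tensor<2x6x143xf32>
//   %f2d  = tensor.collapse_shape %filter [[0], [1, 2, 3]]
//           : tensor<4x6x1x1xf32> into tensor<4x6xf32>
//   ... a batch-style matmul contracting the shared 6-channel dim
//       -> tensor<2x4x143xf32>, then tensor.expand_shape back to
//       tensor<2x4x11x13xf32>.
// (names and exact op sequence illustrative; with a 1x1 filter the output spatial
// size equals the input's 11x13)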
#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = 
{iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = 
flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> 
tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : 
!hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> 
tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 
= hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> 
tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = 
flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> 
tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view 
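// note: every Before/After pair from EraseUnusedLinalgOperandsPass through
// FoldUnitExtentDimsPass prints byte-identical IR. A plausible reason, not
// stated anywhere in this log: the payload already lives inside a pre-formed
// flow.dispatch.workgroups region, and these global-optimization passes
// generally skip ops that are already inside a dispatch region.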
// -----// IR Dump Before DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (func @main unchanged from the preceding dump)
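// note: iree-flow-canonicalize and cse run below in back-to-back pairs, twice
// in a row. This looks like the routine cleanup IREE interleaves between
// phases of the pipeline; with the IR already at a fixed point, both leave it
// untouched.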
// -----// IR Dump Before CSE (cse) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After CSE (cse) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump Before CSE (cse) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After CSE (cse) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump Before SetEncodingPass (iree-dispatch-creation-set-encoding) //----- //
// (func @main unchanged from the preceding dump)
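// note: SetEncodingPass is where matmul-like ops would normally acquire
// data-tiling encodings. It makes no change here, presumably because the only
// candidate, linalg.conv_2d_nchw_fchw, sits inside an existing dispatch
// region, and convolutions are not data-tiled on this generic llvm-cpu target
// in any case.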
0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 
11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before CPUMaterializeHostEncodingPass (iree-codegen-cpu-materialize-host-encoding) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: 
// -----// IR Dump After CPUMaterializeHostEncodingPass (iree-codegen-cpu-materialize-host-encoding) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before CSE (cse) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After CSE (cse) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before CSE (cse) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After CSE (cse) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged; identical to the full module dump above)
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 
: index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: 
index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, 
strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index 
%c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before HoistIntoGlobals (iree-util-hoist-into-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, 
%arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = 
dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before JitGlobalsPass (iree-consteval-jit-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, 
%c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : 
tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = 
flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = 
{iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets 
= [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> 
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: 
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %10 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %8 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump Before CSE (cse) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
// (module as in the "Before FixedPointIterator" dump above, with iree.fixedpoint.iteration = 0 : index added to the module attributes)
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (no change: module identical to the previous dump)
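// Note: inside the dispatch region, the dynamically shaped load is refined to a static type with
// tensor.cast before the convolution, so the codegen backend sees fully static shapes. A minimal
// standalone sketch of that pattern (hypothetical function, not part of this dump):
func.func @refine_to_static(%t: tensor<?x?x?x?xf32>) -> tensor<2x6x11x13xf32> {
  // tensor.cast only refines the type: it asserts, rather than checks, that %t is exactly
  // 2x6x11x13 at runtime, and a mismatching cast is undefined behavior.
  %0 = tensor.cast %t : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
  return %0 : tensor<2x6x11x13xf32>
}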
#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes 
{iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view 
-> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : 
!flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : 
tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global 
private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = 
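// Note: FixedPointIterator re-runs its sub-pipeline (ApplyPatterns, FoldGlobals, FuseGlobals,
// IPO, ...) until the IR stops changing; the iree.fixedpoint.iteration = 0 : index module
// attribute records the current iteration and disappears once the loop converges, which is why
// it is present only between the Before/After FixedPointIterator dumps. A log like this one is
// typically produced with MLIR's standard IR-printing options; the exact flags below are an
// assumption and may vary with the iree-compile version:
//   iree-compile --iree-hal-target-backends=llvm-cpu \
//       --mlir-print-ir-before-all --mlir-print-ir-after-all model.mlir -o model.vmfb 2> dump.mlir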
hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) 
-> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, 
%c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : 
tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = 
flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes 
{iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = 
flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 
2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> 
tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, 
tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : 
(tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return 
%7 : !hal.buffer_view } // -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: 
!flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view 
attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = 
[0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : 
tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = 
hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, 
offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, 
index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> 
!hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = 
arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: 
tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], 
strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = 
arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = 
linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, 
strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 
"input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : 
tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = 
flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] 
: tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, 
index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view 
util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 
: f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // util.func public 
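The dump that follows is the first in this stretch where the IR actually changes: MaterializeDefaultWorkgroupCountRegion attaches the workload [%c2, %c6, %c11, %c13] to flow.dispatch.workgroups, routes the captured sizes through flow.dispatch.workload.ordinal, and appends a count region. As a side check on the shapes these dumps carry, the static 2x4x7x9 result is just the usual convolution size arithmetic applied to the 2x6x11x13 input and the 4x6x5x5 filter; a minimal Python sketch (illustration only, not part of the log):

    def conv_out(size, kernel, stride=1, dilation=1, padding=0):
        # floor((size + 2*padding - dilation*(kernel-1) - 1) / stride) + 1
        return (size + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1

    # NCHW input 2x6x11x13, FCHW filter 4x6x5x5, unit strides/dilations, no padding:
    assert conv_out(11, 5) == 7 and conv_out(13, 5) == 9  # -> tensor<2x4x7x9xf32>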
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch.workgroups[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %8 = flow.dispatch.workload.ordinal %arg4, 0 : index
    %9 = flow.dispatch.workload.ordinal %arg5, 1 : index
    %10 = flow.dispatch.workload.ordinal %arg6, 2 : index
    %11 = flow.dispatch.workload.ordinal %arg7, 3 : index
    %cst = arith.constant 0.000000e+00 : f32
    %12 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%8, %9, %10, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%8, %9, %10, %11} -> tensor<?x?x?x?xf32>
    %13 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %14 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %12 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %16 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %13 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%15 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %16, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  } count(%arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg2, %arg3, %arg4, %arg5
    flow.return %x, %y, %z : index, index, index
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump Before VerifyInputLegalityPass (iree-verify-input-legality) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    // (body identical to @main in the dump above)
  }
}
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- //
// (verification-only pass; module identical to the previous dump, elided)
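To keep the semantics in view while reading the remaining dumps: the dispatch body zero-fills a 2x4x7x9 accumulator (linalg.fill) and then runs linalg.conv_2d_nchw_fchw with unit strides and dilations. A small NumPy reference model of that computation, under the shapes shown above (a sketch for intuition; the function name is ours, not an IREE API):

    import numpy as np

    def conv_2d_nchw_fchw(x, w):
        # x: (N, C, H, W) input; w: (F, C, KH, KW) filter; unit strides/dilations.
        n, c, h, wd = x.shape
        f, _, kh, kw = w.shape
        out = np.zeros((n, f, h - kh + 1, wd - kw + 1), dtype=x.dtype)  # linalg.fill
        for oh in range(out.shape[2]):
            for ow in range(out.shape[3]):
                patch = x[:, :, oh:oh + kh, ow:ow + kw]          # (N, C, KH, KW)
                out[:, :, oh, ow] = np.einsum("nchw,fchw->nf", patch, w)
        return out

    x = np.random.rand(2, 6, 11, 13).astype(np.float32)
    w = np.random.rand(4, 6, 5, 5).astype(np.float32)
    assert conv_2d_nchw_fchw(x, w).shape == (2, 4, 7, 9)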
// -----// IR Dump Before CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- //
// (@main unchanged; identical to the function shown after
//  MaterializeDefaultWorkgroupCountRegionPass, elided)
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch.workgroups[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11}
    %9 = flow.dispatch.workload.ordinal %arg4, 0 : index
    %10 = flow.dispatch.workload.ordinal %arg5, 1 : index
    %11 = flow.dispatch.workload.ordinal %arg6, 2 : index
    %12 = flow.dispatch.workload.ordinal %arg7, 3 : index
    %cst = arith.constant 0.000000e+00 : f32
    %13 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%9, %10, %11, %12], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%9, %10, %11, %12} -> tensor<?x?x?x?xf32>
    %14 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %15 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %13 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %17 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %14 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%16 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %17, %arg12, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  } count(%arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg2, %arg3, %arg4, %arg5
    flow.return %x, %y, %z : index, index, index
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
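CaptureDynamicDims is the other structural change in this stretch: the four hal.buffer_view.dim values (%0 through %3) are now passed into the dispatch alongside the workload, and flow.dispatch.tie_shape binds them to the readonly tensor so the region carries its own shape information. The public contract of @main is stable from here on: two buffer views in, one out. Assuming this log comes from iree-compile targeting llvm-cpu (as #executable_target_embedded_elf_x86_64_ above suggests), the finished module could be exercised roughly as below with the iree-runtime Python bindings and checked against the NumPy model; the file name, driver, and exact API surface are assumptions and may differ by version:

    import numpy as np
    from iree import runtime as ireert

    # Load the compiled artifact (path and driver are illustrative assumptions).
    module = ireert.load_vm_flatbuffer_file("conv.vmfb", driver="local-task")

    x = np.random.rand(2, 6, 11, 13).astype(np.float32)  # %input0: tensor<?x?x?x?xf32>
    w = np.random.rand(4, 6, 5, 5).astype(np.float32)    # %input1: tensor<4x6x5x5xf32>
    y = module.main(x, w)                                # -> %output0: tensor<2x4x7x9xf32>
    assert np.asarray(y).shape == (2, 4, 7, 9)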
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch.workgroups[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %cst = arith.constant 0.000000e+00 : f32
    %8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11}
    %9 = flow.dispatch.workload.ordinal %arg4, 0 : index
    %10 = flow.dispatch.workload.ordinal %arg5, 1 : index
    %11 = flow.dispatch.workload.ordinal %arg6, 2 : index
    %12 = flow.dispatch.workload.ordinal %arg7, 3 : index
    %13 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%9, %10, %11, %12], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11} -> tensor<?x?x?x?xf32>
    %14 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %15 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %13 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %17 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %14 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%16 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %17, %arg12, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  } count(%arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg2, %arg3, %arg4, %arg5
    flow.return %x, %y, %z : index, index, index
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
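// The canonicalizer hoisted %cst to the top of the region and folded the captured dims into the
// flow.dispatch.tensor.load, whose shape annotation now uses the region arguments
// {%arg8, %arg9, %arg10, %arg11} directly instead of the workload ordinals.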
// -----// IR Dump Before CSE (cse) //----- //
// (IR identical to the After CanonicalizerPass dump above)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged)
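// CSE finds no duplicated computation to eliminate, so the IR passes through unchanged.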
// -----// IR Dump Before InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- //
// (IR identical to the dump above)
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- //
// (IR unchanged)
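// InitializeEmptyTensorsPass rewrites tensor.empty ops that would escape a dispatch as results
// (presumably into splat initializations); the tensor.empty here is already consumed by
// linalg.fill inside the dispatch, so nothing changes.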
// -----// IR Dump Before OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch.workgroups[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11}
      %9 = flow.dispatch.workload.ordinal %arg4, 0 : index
      %10 = flow.dispatch.workload.ordinal %arg5, 1 : index
      %11 = flow.dispatch.workload.ordinal %arg6, 2 : index
      %12 = flow.dispatch.workload.ordinal %arg7, 3 : index
      %13 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%9, %10, %11, %12], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11} -> tensor<?x?x?x?xf32>
      %14 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %15 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %13 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %17 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %14 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%16 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %17, %arg12, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    } count(%arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg2, %arg3, %arg4, %arg5
      flow.return %x, %y, %z : index, index, index
    }
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- //
// (module IR unchanged)
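// There are no extern dispatches to outline, so the pass is a no-op. The module-level dump also
// shows the compilation context: the llvm-cpu executable target
// (#executable_target_embedded_elf_x86_64_) and the local #device_target_local device that all
// later stages compile against.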
// -----// IR Dump Before OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- //
// (module IR identical to the dump above)
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
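// OutlineDispatchRegionsPass moves the flow.dispatch.workgroups body into a standalone
// flow.executable (@main_dispatch_0) whose export carries the workgroup-count region; the call
// site in @main shrinks to a single flow.dispatch op.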
// -----// IR Dump Before AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- //
// (module IR identical to the After OutlineDispatchRegionsPass dump above)
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
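// AnnotateDispatchesPass renames the export to summarize its contents (a conv_2d_nchw_fchw over
// f32, with the relevant static sizes encoded in the name), which keeps later dumps, traces, and
// profiles human-readable.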
// -----// IR Dump Before StripDebugOps (iree-util-strip-debug-ops) //----- //
// (flow.executable @main_dispatch_0 only, exactly as in the After AnnotateDispatchesPass dump above)
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
// (IR unchanged)
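// There are no debug ops in the executable to strip; before and after are identical.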
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (util.func @main only, exactly as in the After AnnotateDispatchesPass dump above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (IR unchanged)
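// This canonicalizer run operates on the now-outlined @main and finds nothing left to fold.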
// -----// IR Dump Before DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- //
// (full module, identical to the After AnnotateDispatchesPass dump above)
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- //
// (module IR unchanged)
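// The module contains a single executable, so DeduplicateExecutablesPass has nothing to merge.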
// -----// IR Dump Before InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
// (util.func @main only, unchanged)
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
// (IR unchanged)
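// InjectTensorTracingPass only rewrites dispatches that were marked for tensor tracing; none are
// marked here, so the function is unchanged.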
// -----// IR Dump Before CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- //
// (util.func @main only, unchanged)
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- //
// (IR unchanged)
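// CleanupTensorShapesPass verifies that no stray tensor shape queries remain outside dispatch
// regions now that dims are carried explicitly; @main is already clean.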
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- //
// (full module, identical to the After DeduplicateExecutablesPass state above)
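// From here the flow pipeline runs under a fixed-point iterator that repeats the cleanup passes
// until the module stops changing; the next dumps carry the iree.fixedpoint.iteration = 0 module
// attribute recording the first iteration.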
%arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z 
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before VerifyInputPass (iree-stream-verify-input) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> 
tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch 
@main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = 
#hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = 
"x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = 
"generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// 
IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, 
index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch 
@main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = 
hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : 
!hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = 
arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before ConvertToStreamPass (iree-stream-conversion) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = 
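// ConvertToStreamPass produces the first dump below that actually changes the
// IR: flow.executable becomes stream.executable, tensor-typed dispatch
// operands become !stream.binding arguments resolved via
// stream.binding.subspan, and the ABI boundary is rewritten from
// hal.tensor.import/export into stream.tensor.import/export plus
// stream.async.transfer over sized !stream.resource values, with
// hal.buffer_view.assert guarding each input's declared shape and element type.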
{iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %cst = arith.constant 0.000000e+00 : f32 %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, 
tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor{%0, %1, %2, %3} : index %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%4} %6 = stream.async.transfer %5 : !stream.resource{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4} %element_type_f32_0 = hal.element_type : i32 %dense_row_major_1 = hal.encoding_type : i32 %c4 = arith.constant 4 : index %c6_2 = arith.constant 6 : index %c5 = arith.constant 5 : index %c5_3 = arith.constant 5 : index hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6_2, %c5, %c5_3]) type(%element_type_f32_0) encoding(%dense_row_major_1) %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} %c0 = arith.constant 0 : index %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10} %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%10} %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource{%10} -> !hal.buffer_view util.return %13 : !hal.buffer_view } } // -----// IR Dump Before VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = 
#hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %cst = arith.constant 0.000000e+00 : f32 %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor{%0, %1, %2, %3} : index %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%4} %6 = stream.async.transfer %5 : !stream.resource{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) 
!stream.resource<*>{%4} %element_type_f32_0 = hal.element_type : i32 %dense_row_major_1 = hal.encoding_type : i32 %c4 = arith.constant 4 : index %c6_2 = arith.constant 6 : index %c5 = arith.constant 5 : index %c5_3 = arith.constant 5 : index hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6_2, %c5, %c5_3]) type(%element_type_f32_0) encoding(%dense_row_major_1) %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} %c0 = arith.constant 0 : index %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10} %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%10} %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource{%10} -> !hal.buffer_view util.return %13 : !hal.buffer_view } } // -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %cst = arith.constant 0.000000e+00 : f32 %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, 
%6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor{%0, %1, %2, %3} : index %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%4} %6 = stream.async.transfer %5 : !stream.resource{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4} %element_type_f32_0 = hal.element_type : i32 %dense_row_major_1 = hal.encoding_type : i32 %c4 = arith.constant 4 : index %c6_2 = arith.constant 6 : index %c5 = arith.constant 5 : index %c5_3 = arith.constant 5 : index hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6_2, %c5, %c5_3]) type(%element_type_f32_0) encoding(%dense_row_major_1) %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} %c0 = arith.constant 0 : index %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10} %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%10} %13 = stream.tensor.export 
// -----// IR Dump Before Inliner (inline) //----- //
// (dump elided: byte-for-byte identical to the module printed above)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
  %c0 = arith.constant 0 : index
  %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %cst = arith.constant 0.000000e+00 : f32
  %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
  %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
  %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
  %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
  %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
  %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
  %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
  %9 = tensor.empty() : tensor<2x4x7x9xf32>
  %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
  %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
  %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
  %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
  %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
  %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
  %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
  %9 = tensor.empty() : tensor<2x4x7x9xf32>
  %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  return
}
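// Canonicalization's only visible effect on the dispatch function is constant
// reordering (%cst is hoisted above %c0); the loads, fill, and convolution are
// untouched.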
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (dump elided: identical to util.func @main in the last full module dump above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xf32>{%0, %1, %2, %3} : index
  %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%4}
  %6 = stream.async.transfer %5 : !stream.resource<external>{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4}
  %element_type_f32_0 = hal.element_type<f32> : i32
  %dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32_0) encoding(%dense_row_major_1)
  %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%7}
  %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
  %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index
  %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10}
  %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
  %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource<external>{%10} -> !hal.buffer_view
  util.return %13 : !hal.buffer_view
}
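// Canonicalizing @main dedupes the filter-shape constants (%c6_2 and %c5_3 fold
// into %c6/%c5), hoists %c0 to the top of the entry block, and rewrites the
// input1 assert to shape([%c4, %c6, %c5, %c5]); the dataflow itself is
// unchanged. Note that the duplicate %element_type_f32_0/%dense_row_major_1
// queries survive: canonicalization does not CSE across identical ops.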
// -----// IR Dump After Inliner (inline) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // (dispatch function elided: identical to the canonicalized dispatch function above)
    }
  }
  // (util.func @main elided: identical to the canonicalized @main above)
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (dump elided: identical to the canonicalized util.func @main above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (dump elided: canonicalization is idempotent here; @main is unchanged)
// -----// IR Dump Before CSE (cse) //----- //
// (dump elided: identical to the @main above)
// -----// IR Dump After CSE (cse) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xf32>{%0, %1, %2, %3} : index
  %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%4}
  %6 = stream.async.transfer %5 : !stream.resource<external>{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%7}
  %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
  %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index
  %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10}
  %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
  %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource<external>{%10} -> !hal.buffer_view
  util.return %13 : !hal.buffer_view
}
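// CSE merges the duplicate element-type/encoding queries: %element_type_f32_0
// and %dense_row_major_1 are replaced by the %element_type_f32 and
// %dense_row_major values already computed for input0, so both buffer-view
// asserts now share one pair.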
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (dump elided: identical to the post-CSE @main above)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (dump elided: the pass makes no changes to @main)
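// SimplifyGlobalAccesses reports no changes here, presumably because @main
// performs no util.global loads or stores that could be hoisted or merged.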
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // (dispatch function elided: identical to the canonicalized dispatch function above)
    }
  }
  // (util.func @main elided: identical to the post-CSE @main above)
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (dump elided: identical to the module above)
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (dump elided: identical to the module above)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (dump elided: identical to the module above)
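// FoldGlobals and FuseGlobals likewise leave the module untouched; the only
// global is the immutable @__device_0 device handle, so there appears to be
// nothing to fold or fuse.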
index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor{%0, %1, %2, %3} : index %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%4} %6 = stream.async.transfer %5 : !stream.resource{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10} %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%10} %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource{%10} -> !hal.buffer_view util.return %13 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = 
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (identical to the previous dump)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (identical to the previous dump)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (identical to the previous dump)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (identical to the previous dump)
// -----// IR Dump Before CombineInitializers (iree-util-combine-initializers) //----- //
// (identical to the previous dump)
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- //
// (identical to the previous dump)
// -----// IR Dump Before EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @main_dispatch_0 {
  stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
    stream.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
      %cst = arith.constant 0.000000e+00 : f32
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
      %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
      %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
      %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
      %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
      %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
      %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
      %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %9 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      return
    }
  }
}
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
// (identical to the previous dump)
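For readers following what the dispatch region computes: linalg.conv_2d_nchw_fchw with dilations = [1, 1] and strides = [1, 1] is a plain sliding-window contraction over the channel and filter dimensions, accumulating into the zero-filled output. A NumPy reference of the same computation on these static shapes (an illustrative sketch only, not how IREE executes it):

import numpy as np

def conv2d_nchw_fchw(x, wgt):
    # Reference for linalg.conv_2d_nchw_fchw with unit stride/dilation.
    n, c, h, w = x.shape
    f, _, kh, kw = wgt.shape
    oh, ow = h - kh + 1, w - kw + 1
    out = np.zeros((n, f, oh, ow), dtype=x.dtype)  # the linalg.fill of 0.0
    for i in range(oh):
        for j in range(ow):
            window = x[:, :, i:i + kh, j:j + kw]   # n x c x kh x kw
            out[:, :, i, j] = np.einsum("nchw,fchw->nf", window, wgt)
    return out

x = np.ones((2, 6, 11, 13), dtype=np.float32)
wgt = np.ones((4, 6, 5, 5), dtype=np.float32)
assert conv2d_nchw_fchw(x, wgt).shape == (2, 4, 7, 9)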
// -----// IR Dump Before EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xf32>{%0, %1, %2, %3} : index
  %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%4}
  %6 = stream.async.transfer %5 : !stream.resource<external>{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%7}
  %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
  %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index
  %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10}
  %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
  %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource<external>{%10} -> !hal.buffer_view
  util.return %13 : !hal.buffer_view
}
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %10 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %11 = stream.async.transfer %10 : !stream.resource<external>{%c2400} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c2400}
  %12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%9[%c0 to %7 for %7], %11[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%7}, !stream.resource<*>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%c2016}
  %13 = stream.async.transfer %12 : !stream.resource<*>{%c2016} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c2016}
  %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %14 : !hal.buffer_view
}
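EncodeHostTensorsPass is where stream.tensor.sizeof becomes concrete byte counts: the two static f32 tensors fold to the constants %c2400 and %c2016, and the dynamic tensor<?x?x?x?xf32> size becomes the chain of arith.muli ops (a 4-byte element size multiplied by each runtime dimension). The arithmetic, checked in Python (a standalone verification, not part of the log):

import math

def f32_row_major_bytes(*dims):
    return 4 * math.prod(dims)  # 4 bytes per f32 element, dense row-major

assert f32_row_major_bytes(4, 6, 5, 5) == 2400   # %c2400: the filter
assert f32_row_major_bytes(2, 4, 7, 9) == 2016   # %c2016: the conv result

# The dynamic input follows the same product, built as
#   %4 = %0 * 4; %5 = %4 * %1; %6 = %5 * %2; %7 = %6 * %3
d0, d1, d2, d3 = 2, 6, 11, 13  # e.g. the shapes this trace was compiled for
assert ((d0 * 4) * d1 * d2 * d3) == f32_row_major_bytes(d0, d1, d2, d3) == 6864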
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (identical to the previous dump)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (identical to the previous dump)
// -----// IR Dump Before CSE (cse) //----- //
// (identical to the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (identical to the previous dump)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (identical to the previous dump)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (identical to the previous dump)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %10 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %11 = stream.async.transfer %10 : !stream.resource<external>{%c2400} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c2400}
    %12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%9[%c0 to %7 for %7], %11[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%7}, !stream.resource<*>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%c2016}
    %13 = stream.async.transfer %12 : !stream.resource<*>{%c2016} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c2016}
    %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %14 : !hal.buffer_view
  }
}
"sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %10 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %11 = stream.async.transfer %10 : !stream.resource{%c2400} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c2400} %12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%9[%c0 to %7 for %7], %11[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%7}, !stream.resource<*>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%c2016} %13 = stream.async.transfer %12 : !stream.resource<*>{%c2016} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%c2016} %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %14 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func 
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %10 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %11 = stream.async.transfer %10 : !stream.resource{%c2400} from(#hal.device.affinity<@__device_0>) -> 
to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c2400} %12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%9[%c0 to %7 for %7], %11[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%7}, !stream.resource<*>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%c2016} %13 = stream.async.transfer %12 : !stream.resource<*>{%c2016} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%c2016} %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %14 : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: 
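// NOTE: a quick sanity check on the constants in @main above (added annotation, not part of the original dump):
//   %c2400 = 4*6*5*5 f32 elements * 4 bytes = 2400 (input1, the FCHW filter)
//   %c2016 = 2*4*7*9 f32 elements * 4 bytes = 2016 (the conv result)
//   output spatial dims follow from stride 1, dilation 1, no padding: 7 = 11 - 5 + 1 and 9 = 13 - 5 + 1,
//   with batch N=2 carried through and F=4 output channels from the filter (input C=6 matches filter C=6).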
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module IR unchanged; identical to the dump above)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module IR unchanged; identical to the dump above)
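// NOTE: the arith.muli chain in @main computes the byte size of the dynamically
// shaped input0 for the import (added annotation): %7 = (((%0 * 4) * %1) * %2) * %3,
// i.e. d0*d1*d2*d3 elements times 4 bytes per f32 element (%c4 = sizeof(f32)).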
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module IR unchanged; identical to the dump above)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (module IR unchanged; identical to the dump above)
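// NOTE: how the dispatch-site operands map into the dispatch function (added annotation):
// %arg0/%arg1/%arg10 are the !stream.binding resources (input data, filter, result);
// %arg2..%arg5 carry the workload [%c2, %c6, %c11, %c13] and are re-associated inside
// the body via flow.dispatch.workload.ordinal; %arg6..%arg9 carry the runtime dims
// %0..%3 of input0. The export's workgroups() region derives the X/Y/Z workgroup
// count from that workload via flow.dispatch.workgroup_count_from_slice.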
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module IR unchanged; identical to the dump above)
// -----// IR Dump Before VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
// (module IR unchanged; identical to the dump above)
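// NOTE on resource lifetimes in the stream dialect (added annotation, inferred from
// the op and type names): !stream.resource<external> holds buffers that cross the
// ABI boundary via stream.tensor.import/export, while !stream.resource<*> is a
// not-yet-refined lifetime placeholder; the stream.async.transfer ops bridge the
// two and are candidates for removal once usage is refined (see the elide-copies
// pass later in this log).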
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
// (module IR unchanged; identical to the dump above)
// -----// IR Dump Before MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
// (this dump prints util.func @main only; identical to @main in the full dump above)
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
// (util.func @main unchanged; identical to @main above)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (util.func @main unchanged; identical to @main above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (util.func @main unchanged; identical to @main above)
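// NOTE: headers of the form "// -----// IR Dump Before/After <Pass> (<flag>) //----- //"
// come from MLIR's IR-printing instrumentation. A plausible way to reproduce a log
// like this one (assumed; the exact invocation is not recorded in the dump) is:
//   iree-compile --iree-hal-target-backends=llvm-cpu \
//     --mlir-print-ir-before-all --mlir-print-ir-after-all \
//     input.mlir -o output.vmfb
// where input.mlir/output.vmfb are hypothetical file names.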
// -----// IR Dump Before ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  // (util.func @main body identical to the "After MaterializeCopyOnWritePass" dump above.)
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {...}} { ... }
}
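// Note: the stream.executable above pairs two views of the dispatch: the `workgroups`
// region derives the XYZ workgroup count from the four workload values via
// flow.dispatch.workgroup_count_from_slice, and the inner func.func resolves its
// opaque !stream.binding arguments into typed !flow.dispatch.tensor views. The
// tensor.cast from tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> re-introduces the
// statically known input shape inside the dispatch before conv_2d_nchw_fchw runs.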
"embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) 
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (IR identical to the @main dump above.)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (IR unchanged; the pass made no modifications.)
// -----// IR Dump Before EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
// (IR identical to the preceding dump.)
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
// (IR unchanged; the pass made no modifications.)
// -----// IR Dump Before RefineUsagePass (iree-stream-refine-usage) //----- //
// (module IR identical to the "Before ElideAsyncCopiesPass" dump above.)
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  // (stream.executable @main_dispatch_0 identical to the "Before ElideAsyncCopiesPass" dump above.)
  stream.executable private @main_dispatch_0 { ... }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<external>{%7}, !stream.resource<external>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<external>{%c2016}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
}
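// Note: RefineUsagePass is the only pass in this stretch of the pipeline that changes
// the IR. The three same-affinity stream.async.transfer ops are removed and every
// wildcard !stream.resource<*> is refined to a concrete <external> lifetime, so the
// dispatch now consumes the imported buffers (%8, %9) and produces the exported
// result (%10) directly.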
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (IR identical to @main in the "After RefineUsagePass" dump above.)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (IR unchanged; the pass made no modifications.)
// -----// IR Dump Before CSE (cse) //----- //
// (IR identical to the preceding dump.)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged; the pass made no modifications.)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (IR identical to the preceding dump.)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (IR unchanged; the pass made no modifications.)
on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration 
= "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> 
!flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : 
index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : 
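// note: the magic buffer sizes above are plain row-major byte counts:
//   %c2400: weights input1 = 4*6*5*5 f32 elements * 4 bytes = 2400 bytes
//   %c2016: output0        = 2*4*7*9 f32 elements * 4 bytes = 2016 bytes
//   %7: the dynamically shaped input0 is sized at runtime as %0 * 4 * %1 * %2 * %3,
//       i.e. dim0 * sizeof(f32) * dim1 * dim2 * dim3, from the hal.buffer_view.dim queries above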
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module identical to the dump above)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (module identical to the dump above)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged)
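// note: FoldGlobals, FuseGlobals and IPO are util-dialect cleanup passes; this module has a
// single immutable global (@__device_0) and one public entry point, so there is nothing to
// fold, fuse or propagate, and every dump in this stretch is byte-for-byte identical.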
// -----// IR Dump Before VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
// (module identical to the dump above)
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
// (module unchanged)
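// note: as its name suggests, VerifyAsyncAccessRangesPass only checks that each
// stream.async.dispatch stays inside its declared operand ranges (e.g. %8[%c0 to %7 for %7]);
// it emits diagnostics rather than rewriting IR, which is why the dumps around it match.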
// -----// IR Dump Before ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
// (function-scope dump of @main only; identical to @main in the module dump above)
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400}
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} {
    %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016}
    stream.yield %12 : !stream.resource{%c2016}
  } => !stream.timepoint
  %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
}
// -----// IR Dump Before ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
// (function-scope dump; identical to the After-ScheduleExecutionPass dump above)
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
// (function unchanged: with a single dispatch there is nothing to group into a concurrent region)
// -----// IR Dump Before PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
// (module-scope dump; stream.executable @main_dispatch_0 and @main are identical to the state above)
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
// (attribute aliases, util.global @__device_0 and stream.executable @main_dispatch_0 are unchanged and elided)
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400}
  %10 = stream.timepoint.immediate => !stream.timepoint
  %11 = stream.timepoint.immediate => !stream.timepoint
  %12 = stream.timepoint.join max(%10, %11) => !stream.timepoint
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%12) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} {
    %15 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016}
    stream.yield %15 : !stream.resource{%c2016}
  } => !stream.timepoint
  %13 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016}
  %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view
  util.return %14 : !hal.buffer_view
}
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.timepoint.immediate => !stream.timepoint %11 = stream.timepoint.immediate => !stream.timepoint %12 = stream.timepoint.join max(%10, %11) => !stream.timepoint %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) 
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- //
// (module unchanged by this pass; identical to the dump above)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (@main printed at function scope, identical to @main in the dump above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}) -> !stream.resource<external>{%c2016} {
    %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<external>{%7}, !stream.resource<external>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<external>{%c2016}
    stream.yield %12 : !stream.resource<external>{%c2016}
  } => !stream.timepoint
  %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2016}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
}
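The only change the canonicalizer makes here is to the synchronization: both imports produced stream.timepoint.immediate values (already-signaled timepoints), their join is itself immediate, and awaiting an immediate timepoint is a no-op, so the await clause on stream.async.execute folds away. A toy model of that folding in Python; this is our sketch of the algebra, not IREE's implementation:

IMMEDIATE = None  # stands in for an already-signaled timepoint

def join(*timepoints):
    # A join of timepoints is immediate iff every input is immediate.
    pending = tuple(t for t in timepoints if t is not IMMEDIATE)
    return IMMEDIATE if not pending else pending

def await_is_noop(timepoint):
    # An await on an immediate timepoint can be erased entirely.
    return timepoint is IMMEDIATE

assert await_is_noop(join(IMMEDIATE, IMMEDIATE))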
// -----// IR Dump Before CSE (cse) //----- //
// (@main unchanged; identical to the post-canonicalize dump above)
// -----// IR Dump After CSE (cse) //----- //
// (@main unchanged)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (@main unchanged)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (@main unchanged)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}) -> !stream.resource<external>{%c2016} {
      %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<external>{%7}, !stream.resource<external>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<external>{%c2016}
      stream.yield %12 : !stream.resource<external>{%c2016}
    } => !stream.timepoint
    %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2016}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
}
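Each resource operand of stream.async.dispatch carries an explicit byte range, e.g. %arg2[%c0 to %7 for %7]: an offset, an exclusive end, and a redundant but explicit length. Our reading of that syntax as a hypothetical Python helper (names are ours):

def resource_range(buf: bytes, offset: int, end: int, length: int) -> bytes:
    # "[offset to end for length]": end - offset must equal length.
    assert end - offset == length
    return buf[offset:offset + length]

filter_buf = bytes(2400)  # input1: 4*6*5*5 f32 elements = 2400 bytes
assert len(resource_range(filter_buf, 0, 2400, 2400)) == 2400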
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (module unchanged)
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (module unchanged)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged)
// -----// IR Dump Before VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- //
// (module unchanged)
stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} { %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} stream.yield %12 : !stream.resource{%c2016} } => !stream.timepoint %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> 
-> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} { %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} stream.yield %12 : !stream.resource{%c2016} } => !stream.timepoint %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump Before ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func 
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} { %12 = stream.async.dispatch 
@main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} stream.yield %12 : !stream.resource{%c2016} } => !stream.timepoint %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = 
"sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump Before PackConstantsPass (iree-stream-pack-constants) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) 
type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) 
await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump Before LayoutSlicesPass (iree-stream-layout-slices) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After LayoutSlicesPass 
(iree-stream-layout-slices) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump Before PropagateSubranges (iree-util-propagate-subranges) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = 
flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized 
on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 
0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = 
stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> 
!hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = 
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016}
    %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %12 : !hal.buffer_view
  }
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (verbatim duplicate of the module dump above; repeated output elided)
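The dispatch function in the module dump above reduces to a single linalg.conv_2d_nchw_fchw with unit strides and dilations, so the static result shape follows from 11 - 5 + 1 = 7 and 13 - 5 + 1 = 9 on the spatial dims. A minimal NumPy reference for the same op semantics (an illustrative sketch, not IREE code; the function name is invented here):

import numpy as np

def conv_2d_nchw_fchw(x, w):
    # Direct convolution matching linalg.conv_2d_nchw_fchw with unit
    # strides/dilations: x is NxCxHxW input, w is FxCxKHxKW filters.
    n, c, h, w_in = x.shape
    f, c2, kh, kw = w.shape
    assert c == c2
    out = np.zeros((n, f, h - kh + 1, w_in - kw + 1), dtype=x.dtype)
    for oh in range(out.shape[2]):
        for ow in range(out.shape[3]):
            patch = x[:, :, oh:oh + kh, ow:ow + kw]      # N x C x KH x KW
            out[:, :, oh, ow] = np.einsum("nchw,fchw->nf", patch, w)
    return out

x = np.random.rand(2, 6, 11, 13).astype(np.float32)
w = np.random.rand(4, 6, 5, 5).astype(np.float32)
assert conv_2d_nchw_fchw(x, w).shape == (2, 4, 7, 9)   # matches the dump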
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (verbatim duplicate of the module dump above; repeated output elided)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (verbatim duplicate of the module dump above; repeated output elided)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
// (verbatim duplicate of the module dump above; repeated output elided)
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (function-scope dump of @main; verbatim duplicate of the @main dump above, repeated output elided)
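Throughout these dumps the @main function keeps the same async structure: stream.resource.alloca yields the result storage plus a timepoint, stream.cmd.execute awaits that timepoint and yields a new one, and stream.timepoint.await blocks before the tensor is exported. A rough host-side analogy of that chaining (illustrative Python only; this is not IREE's runtime API, and the helper names are invented):

import threading

class Timepoint:
    # Stand-in for !stream.timepoint: a one-shot completion signal.
    def __init__(self):
        self._done = threading.Event()
    def signal(self):
        self._done.set()
    def wait(self):
        self._done.wait()

def alloca(nbytes):
    # Like stream.resource.alloca: storage plus a timepoint that fires
    # once the (possibly asynchronous) allocation is ready.
    storage, tp = bytearray(nbytes), Timepoint()
    tp.signal()  # immediate here; a real allocator may signal later
    return storage, tp

def execute(await_tp, work):
    # Like stream.cmd.execute await(...): run `work` only after
    # `await_tp` fires; return a timepoint for the work's completion.
    done = Timepoint()
    def runner():
        await_tp.wait()
        work()
        done.signal()
    threading.Thread(target=runner).start()
    return done

result, result_tp = alloca(2016)              # %result, %result_timepoint
exec_tp = execute(result_tp, lambda: None)    # %10 = stream.cmd.execute ...
exec_tp.wait()                                # %11 = stream.timepoint.await %10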
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (function unchanged: identical to the @main dump above)
// -----// IR Dump Before CSE (cse) //----- //
// (function unchanged)
// -----// IR Dump After CSE (cse) //----- //
// (function unchanged)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    // (body identical to the @main dump above)
  }
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (module unchanged: identical to the full-module dump above)
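For reference, this is how the stream.cmd.dispatch site in @main lines up with the signature of the dispatch function in the executable above. The annotation is ours, not part of the dump; the argument names follow the IR:

dispatch_abi = {
    "%arg0":       "ro binding: dynamic input (subspan -> tensor<?x?x?x?xf32>)",
    "%arg1":       "ro binding: filter (subspan -> tensor<4x6x5x5xf32>, 2400 bytes)",
    "%arg10":      "wo binding: result (subspan -> tensor<2x4x7x9xf32>, 2016 bytes)",
    "%arg2-%arg5": "workload values 2, 6, 11, 13, re-tagged by flow.dispatch.workload.ordinal 0-3",
    "%arg6-%arg9": "runtime dims %0-%3 queried from input0's buffer view",
}

Note that the four extents travel twice: once as the workload [%c2, %c6, %c11, %c13], which feeds only the workgroup-count region of the stream.executable.export, and once as plain index operands that the dispatch body reads.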
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged)
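One detail of @main worth calling out: the two hal.buffer_view.assert ops pin different amounts of static information. An illustrative Python rendering (function names are ours):

def check_input0(view_shape, element_type, encoding):
    # shape([%0, %1, %2, %3]) is read back from the view itself, so only
    # rank 4, element type f32 and dense row-major encoding are enforced;
    # the extents stay dynamic until the tensor.cast inside the dispatch.
    assert len(view_shape) == 4
    assert element_type == "f32" and encoding == "dense_row_major"

def check_input1(view_shape, element_type, encoding):
    # input1 is fully static: it must be exactly 4x6x5x5xf32.
    assert tuple(view_shape) == (4, 6, 5, 5)
    assert element_type == "f32" and encoding == "dense_row_major"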
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (module unchanged)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged)
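The stream ops in @main form a short timepoint chain: stream.resource.alloca yields the result buffer plus a timepoint, stream.cmd.execute awaits that timepoint and yields another, and stream.timepoint.await blocks on it before stream.tensor.export. A rough host-side analogy in Python (our sketch, not the IREE runtime API):

from concurrent.futures import ThreadPoolExecutor

def allocate(n):          # stands in for stream.resource.alloca
    return bytearray(n)

def run_dispatch(buf):    # stands in for the conv dispatch writing %result
    buf[0:4] = b"\x00\x00\x00\x00"
    return buf

pool = ThreadPoolExecutor(max_workers=1)
t0 = pool.submit(allocate, 2016)                     # alloca ... => !stream.timepoint
t1 = pool.submit(lambda: run_dispatch(t0.result()))  # execute await(t0) ... => t1
result = t1.result()                                 # stream.timepoint.await t1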
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- //
// (function unchanged: @main contains no scf ops to convert)
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
// (function unchanged)
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- //
// (module unchanged: identical to the full-module dump above)
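iree-util-fixed-point-iterator wraps a cleanup sub-pipeline and, as the repeated Canonicalizer/CSE dumps around it suggest, appears to re-run that pipeline until the module stops changing. A generic sketch of such a driver loop (ours, not IREE's implementation):

def run_to_fixed_point(module, passes, max_iterations=10):
    # 'module' is an immutable IR snapshot; each pass returns a new snapshot.
    for _ in range(max_iterations):
        before = module
        for apply_pass in passes:
            module = apply_pass(module)
        if module == before:  # no pass made progress: fixed point reached
            break
    return module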
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
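// NOTE (added annotation, not compiler output): the dispatch's static output shape follows from the conv parameters. With strides and dilations of 1, out = in - kernel + 1, so 11 - 5 + 1 = 7 and 13 - 5 + 1 = 9, giving tensor<2x4x7x9xf32> (batch 2, 4 filters); the tensor.cast pins the dynamic input to the 2x6x11x13 shape this dispatch was specialized for.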
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump Before CSE (cse) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump After CSE (cse) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
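// NOTE (added annotation, not compiler output): host/device ordering in @main is carried by timepoints rather than blocking waits: stream.resource.alloca yields %result_timepoint, stream.cmd.execute awaits it and yields %10, and stream.timepoint.await %10 blocks only at the end, just before %result is exported as a !hal.buffer_view.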
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
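// NOTE (added annotation, not compiler output): the iree.fixedpoint.iteration = 0 module attribute appears to be bookkeeping for the FixedPointIterator driver, which reruns this cleanup sub-pipeline (canonicalize, cse, global folding/fusion, ipo) until the IR stops changing; the before/after dumps here are identical, so this iteration has already converged.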
public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, 
%arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute 
on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> 
return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump After IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, 
%arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute 
// -----// IR Dump Before ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
// (Verbatim identical to the "After IPO" dump above.)
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
// (Unchanged by the pass; still identical to the dump above.)
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
// (Identical to the dump above except that the module attribute
//  `iree.fixedpoint.iteration = 0 : index` has been dropped, leaving
//  `module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>}`,
//  now that the fixed-point iteration has settled.)
// -----// IR Dump Before FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
// (Identical to the "After FixedPointIterator" state above.)
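// NOTE: FuseDispatchBindingsPass (next dump) reworks the dispatch interface:
// the three !stream.binding arguments are packed at the front of the argument
// list and each binding gains an explicit byte-offset operand, so subspans are
// taken at a caller-provided offset instead of a hard-coded zero. A hedged
// sketch of the rewrite, with illustrative names (not from the log):
//
//   // before: %0 = stream.binding.subspan %binding[%c0] : !stream.binding -> ...
//   // after:  %0 = stream.binding.subspan %binding[%offset] : !stream.binding -> ...
//   // where %offset is passed at the stream.cmd.dispatch site (here always 0).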
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: index, %arg13: index) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg10, %arg11, %arg12, %arg13}
        %3 = flow.dispatch.workload.ordinal %arg6, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg7, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg8, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg9, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg10, %arg11, %arg12, %arg13} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %c0_0 = arith.constant 0 : index
    %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0, %c0, %c0, %c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index, index, index, index) {
        ro %arg2[%c0_0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0_0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0_0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016}
    %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %12 : !hal.buffer_view
  }
}
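// NOTE: In the fused form above, %arg3-%arg5 are the new binding offsets (all
// zero at the only dispatch site, hence the extra %c0, %c0, %c0 operands) and
// %arg10-%arg13 still carry the unverified dynamic dims of input0.
// AnnotateDispatchArgumentsPass (next dump) then records what dataflow
// analysis can prove about each operand as argument attributes, e.g.:
//
//   %argN: index {stream.alignment = 2 : index, stream.values = [2 : index]}
//
// Operands fed straight from hal.buffer_view.dim stay unannotated: their
// values are unknown until runtime.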
// -----// IR Dump Before AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
// (Verbatim identical to the "After FuseDispatchBindingsPass" dump above.)
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}, %arg6: index {stream.alignment = 2 : index, stream.values = [2 : index]}, %arg7: index {stream.alignment = 2 : index, stream.values = [6 : index]}, %arg8: index {stream.values = [11 : index]}, %arg9: index {stream.values = [13 : index]}, %arg10: index, %arg11: index, %arg12: index, %arg13: index) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg10, %arg11, %arg12, %arg13}
        %3 = flow.dispatch.workload.ordinal %arg6, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg7, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg8, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg9, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg10, %arg11, %arg12, %arg13} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %c0_0 = arith.constant 0 : index
    %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0, %c0, %c0, %c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index, index, index, index) {
        ro %arg2[%c0_0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0_0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0_0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016}
    %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %12 : !hal.buffer_view
  }
}
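// NOTE: PackDispatchOperandsPass (next dump) lowers every index operand to
// the 32-bit push-constant ABI: the host splits each value into lo/hi i32
// halves and the dispatch reassembles them. A minimal sketch of the round
// trip, using illustrative names:
//
//   // host side:
//   %v64 = arith.index_castui %v : index to i64
//   %lo = arith.trunci %v64 : i64 to i32
//   %hi64 = arith.shrui %v64, %c32_i64 : i64
//   %hi = arith.trunci %hi64 : i64 to i32
//   // device side:
//   %lo64 = arith.extui %lo : i32 to i64
//   %hi64b = arith.extui %hi : i32 to i64
//   %shl = arith.shli %hi64b, %c32_i64 : i64
//   %v64b = arith.ori %lo64, %shl : i64
//   %v2 = arith.index_castui %v64b : i64 to index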
// -----// IR Dump Before PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
// (Verbatim identical to the "After AnnotateDispatchArgumentsPass" dump above.)
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32, %arg16: i32, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i32, %arg21: i32, %arg22: i32, %arg23: i32, %arg24: i32) {
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %c32_i64 = arith.constant 32 : i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %c32_i64_0 = arith.constant 32 : i64
        %7 = arith.shli %6, %c32_i64_0 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %c32_i64_1 = arith.constant 32 : i64
        %12 = arith.shli %11, %c32_i64_1 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %c32_i64_2 = arith.constant 32 : i64
        %17 = arith.shli %16, %c32_i64_2 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index
        %20 = arith.extui %arg11 : i32 to i64
        %21 = arith.extui %arg12 : i32 to i64
        %c32_i64_3 = arith.constant 32 : i64
        %22 = arith.shli %21, %c32_i64_3 : i64
        %23 = arith.ori %20, %22 : i64
        %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index
        %25 = arith.extui %arg13 : i32 to i64
        %26 = arith.extui %arg14 : i32 to i64
        %c32_i64_4 = arith.constant 32 : i64
        %27 = arith.shli %26, %c32_i64_4 : i64
        %28 = arith.ori %25, %27 : i64
        %29 = arith.index_castui %28 {stream.values = [11 : index]} : i64 to index
        %30 = arith.extui %arg15 : i32 to i64
        %31 = arith.extui %arg16 : i32 to i64
        %c32_i64_5 = arith.constant 32 : i64
        %32 = arith.shli %31, %c32_i64_5 : i64
        %33 = arith.ori %30, %32 : i64
        %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index
        %35 = arith.extui %arg17 : i32 to i64
        %36 = arith.extui %arg18 : i32 to i64
        %c32_i64_6 = arith.constant 32 : i64
        %37 = arith.shli %36, %c32_i64_6 : i64
        %38 = arith.ori %35, %37 : i64
        %39 = arith.index_castui %38 : i64 to index
        %40 = arith.extui %arg19 : i32 to i64
        %41 = arith.extui %arg20 : i32 to i64
        %c32_i64_7 = arith.constant 32 : i64
        %42 = arith.shli %41, %c32_i64_7 : i64
        %43 = arith.ori %40, %42 : i64
        %44 = arith.index_castui %43 : i64 to index
        %45 = arith.extui %arg21 : i32 to i64
        %46 = arith.extui %arg22 : i32 to i64
        %c32_i64_8 = arith.constant 32 : i64
        %47 = arith.shli %46, %c32_i64_8 : i64
        %48 = arith.ori %45, %47 : i64
        %49 = arith.index_castui %48 : i64 to index
        %50 = arith.extui %arg23 : i32 to i64
        %51 = arith.extui %arg24 : i32 to i64
        %c32_i64_9 = arith.constant 32 : i64
        %52 = arith.shli %51, %c32_i64_9 : i64
        %53 = arith.ori %50, %52 : i64
        %54 = arith.index_castui %53 : i64 to index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %55 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%39, %44, %49, %54}
        %58 = flow.dispatch.workload.ordinal %19, 0 : index
        %59 = flow.dispatch.workload.ordinal %24, 1 : index
        %60 = flow.dispatch.workload.ordinal %29, 2 : index
        %61 = flow.dispatch.workload.ordinal %34, 3 : index
        %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%39, %44, %49, %54} -> tensor<?x?x?x?xf32>
        %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %64 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %62 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %c0_0 = arith.constant 0 : index
    %c0_i64 = arith.constant 0 : i64
    %c0_i32 = arith.constant 0 : i32
    %c32_i64 = arith.constant 32 : i64
    %c0_i64_1 = arith.constant 0 : i64
    %c0_i32_2 = arith.constant 0 : i32
    %c0_i64_3 = arith.constant 0 : i64
    %c0_i32_4 = arith.constant 0 : i32
    %c32_i64_5 = arith.constant 32 : i64
    %c0_i64_6 = arith.constant 0 : i64
    %c0_i32_7 = arith.constant 0 : i32
    %c0_i64_8 = arith.constant 0 : i64
    %c0_i32_9 = arith.constant 0 : i32
    %c32_i64_10 = arith.constant 32 : i64
    %c0_i64_11 = arith.constant 0 : i64
    %c0_i32_12 = arith.constant 0 : i32
    %c2_i64 = arith.constant 2 : i64
    %c2_i32 = arith.constant 2 : i32
    %c32_i64_13 = arith.constant 32 : i64
    %c0_i64_14 = arith.constant 0 : i64
    %c0_i32_15 = arith.constant 0 : i32
    %c6_i64 = arith.constant 6 : i64
    %c6_i32 = arith.constant 6 : i32
    %c32_i64_16 = arith.constant 32 : i64
    %c0_i64_17 = arith.constant 0 : i64
    %c0_i32_18 = arith.constant 0 : i32
    %c11_i64 = arith.constant 11 : i64
    %c11_i32 = arith.constant 11 : i32
    %c32_i64_19 = arith.constant 32 : i64
    %c0_i64_20 = arith.constant 0 : i64
    %c0_i32_21 = arith.constant 0 : i32
    %c13_i64 = arith.constant 13 : i64
    %c13_i32 = arith.constant 13 : i32
    %c32_i64_22 = arith.constant 32 : i64
    %c0_i64_23 = arith.constant 0 : i64
    %c0_i32_24 = arith.constant 0 : i32
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %c32_i64_25 = arith.constant 32 : i64
    %12 = arith.shrui %10, %c32_i64_25 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %c32_i64_26 = arith.constant 32 : i64
    %16 = arith.shrui %14, %c32_i64_26 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %c32_i64_27 = arith.constant 32 : i64
    %20 = arith.shrui %18, %c32_i64_27 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %c32_i64_28 = arith.constant 32 : i64
    %24 = arith.shrui %22, %c32_i64_28 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32_2, %c0_i32_4, %c0_i32_7, %c0_i32_9, %c0_i32_12, %c2_i32, %c0_i32_15, %c6_i32, %c0_i32_18, %c11_i32, %c0_i32_21, %c13_i32, %c0_i32_24, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0_0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0_0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0_0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}
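// NOTE: Packing leaves a trail of redundant constants in @main above: a dozen
// duplicate `arith.constant 32 : i64` / `0 : i32` values plus dead i64
// constants that were only needed while folding the known operand halves.
// The canonicalizer dumps that follow collapse them to one instance each, so
// every known-zero high half becomes a single %c0_i32 at the dispatch site;
// only the four dynamic dims still need real trunci/shrui work at runtime.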
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (This dump is scoped to @main only; it is verbatim the util.func from the
//  "After PackDispatchOperandsPass" dump above.)
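// NOTE: In the dump that follows, the high i32 halves of the dynamic dims
// (%13, %17, %21, %25) are still computed and passed - nothing proves the
// hal.buffer_view dims fit in 32 bits, so only the constant halves folded.
// The CSE run that closes this log would at most merge any remaining
// structurally identical ops; the dispatch itself is already in its final
// packed form.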
Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13_i32 = arith.constant 13 : i32 %c11_i32 = arith.constant 11 : i32 %c6_i32 = arith.constant 6 : i32 %c2_i32 = arith.constant 2 : i32 %c32_i64 = arith.constant 32 : i64 %c0_i32 = arith.constant 0 : i32 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c2_i32, %c0_i32, %c6_i32, %c0_i32, %c11_i32, %c0_i32, %c13_i32, %c0_i32, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in 
!stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view }
// -----// IR Dump Before CSE (cse) //----- // (unchanged; verbatim duplicate of the After-Canonicalizer dump above, elided)
// -----// IR Dump After CSE (cse) //----- // (unchanged; canonicalization already deduplicated every constant, leaving CSE nothing to do)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // (unchanged; verbatim duplicate elided)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // (unchanged; @main has no util.global loads or stores to simplify)
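// [Editor's note: the Canonicalizer and CSE dumps above differ from the pre-pass IR in one visible way: the per-use constants (%c32_i64_19, %c32_i64_22, %c32_i64_25, and the many %c0_i32_N / %c0_i64_N copies) collapse into a single %c32_i64 and %c0_i32, and the now-dead i64 constants disappear. Below is a minimal standalone sketch of that folding, assuming only the upstream arith/func dialects; the function name is hypothetical and it can be fed to `mlir-opt --canonicalize --cse`:]
func.func @dedup_sketch(%arg0: i64, %arg1: i64) -> (i32, i32) {
  // Two textually distinct copies of the same constant, as in the pre-pass IR.
  %c32_a = arith.constant 32 : i64
  %c32_b = arith.constant 32 : i64
  %0 = arith.shrui %arg0, %c32_a : i64
  %1 = arith.shrui %arg1, %c32_b : i64
  %2 = arith.trunci %0 : i64 to i32
  %3 = arith.trunci %1 : i64 to i32
  // After --canonicalize --cse a single %c32_i64 feeds both arith.shrui ops,
  // matching the lone %c32_i64 in the post-CSE dump above.
  return %2, %3 : i32, i32
}
// [End of editor's sketch.]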
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32, %arg16: i32, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i32, %arg21: i32, %arg22: i32, %arg23: i32, %arg24: i32) { %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %c32_i64 = arith.constant 32 : i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %c32_i64_0 = arith.constant 32 : i64 %7 = arith.shli %6, %c32_i64_0 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %c32_i64_1 = arith.constant 32 : i64 %12 = arith.shli %11, %c32_i64_1 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %c32_i64_2 = arith.constant 32 : i64 %17 = arith.shli %16, %c32_i64_2 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index %20 = arith.extui %arg11 : i32 to i64 %21 = arith.extui %arg12 : i32 to i64 %c32_i64_3 = arith.constant 32 : i64 %22 = arith.shli %21, %c32_i64_3 : i64 %23 = arith.ori %20, %22 : i64 %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index %25 = arith.extui %arg13 : i32 to i64 %26 = arith.extui %arg14 : i32 to i64 %c32_i64_4 = arith.constant 32 : i64 %27 = arith.shli %26, %c32_i64_4 : i64 %28 = arith.ori %25, %27 : i64 %29 
= arith.index_castui %28 {stream.values = [11 : index]} : i64 to index %30 = arith.extui %arg15 : i32 to i64 %31 = arith.extui %arg16 : i32 to i64 %c32_i64_5 = arith.constant 32 : i64 %32 = arith.shli %31, %c32_i64_5 : i64 %33 = arith.ori %30, %32 : i64 %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index %35 = arith.extui %arg17 : i32 to i64 %36 = arith.extui %arg18 : i32 to i64 %c32_i64_6 = arith.constant 32 : i64 %37 = arith.shli %36, %c32_i64_6 : i64 %38 = arith.ori %35, %37 : i64 %39 = arith.index_castui %38 : i64 to index %40 = arith.extui %arg19 : i32 to i64 %41 = arith.extui %arg20 : i32 to i64 %c32_i64_7 = arith.constant 32 : i64 %42 = arith.shli %41, %c32_i64_7 : i64 %43 = arith.ori %40, %42 : i64 %44 = arith.index_castui %43 : i64 to index %45 = arith.extui %arg21 : i32 to i64 %46 = arith.extui %arg22 : i32 to i64 %c32_i64_8 = arith.constant 32 : i64 %47 = arith.shli %46, %c32_i64_8 : i64 %48 = arith.ori %45, %47 : i64 %49 = arith.index_castui %48 : i64 to index %50 = arith.extui %arg23 : i32 to i64 %51 = arith.extui %arg24 : i32 to i64 %c32_i64_9 = arith.constant 32 : i64 %52 = arith.shli %51, %c32_i64_9 : i64 %53 = arith.ori %50, %52 : i64 %54 = arith.index_castui %53 : i64 to index %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %55 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor> %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor> %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor>{%39, %44, %49, %54} %58 = flow.dispatch.workload.ordinal %19, 0 : index %59 = flow.dispatch.workload.ordinal %24, 1 : index %60 = flow.dispatch.workload.ordinal %29, 2 : index %61 = flow.dispatch.workload.ordinal %34, 3 : index %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%39, %44, %49, %54} -> tensor %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %64 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %62 : tensor to tensor<2x6x11x13xf32> %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13_i32 = arith.constant 13 : i32 %c11_i32 = arith.constant 11 : i32 %c6_i32 = arith.constant 6 : i32 %c2_i32 = arith.constant 2 : i32 %c32_i64 = arith.constant 32 : i64 %c0_i32 = arith.constant 0 : i32 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : 
!hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c2_i32, %c0_i32, %c6_i32, %c0_i32, %c11_i32, %c0_i32, %c13_i32, %c0_i32, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, 
%arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32, %arg16: i32, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i32, %arg21: i32, %arg22: i32, %arg23: i32, %arg24: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index %20 = arith.extui %arg11 : i32 to i64 %21 = arith.extui %arg12 : i32 to i64 %22 = arith.shli %21, %c32_i64 : i64 %23 = arith.ori %20, %22 : i64 %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index %25 = arith.extui %arg13 : i32 to i64 %26 = arith.extui %arg14 : i32 to i64 %27 = arith.shli %26, %c32_i64 : i64 %28 = arith.ori %25, %27 : i64 %29 = arith.index_castui %28 {stream.values = [11 : index]} : i64 to index %30 = arith.extui %arg15 : i32 to i64 %31 = arith.extui %arg16 : i32 to i64 %32 = arith.shli %31, %c32_i64 : i64 %33 = arith.ori %30, %32 : i64 %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index %35 = arith.extui %arg17 : i32 to i64 %36 = arith.extui %arg18 : i32 to i64 %37 = arith.shli %36, %c32_i64 : i64 %38 = arith.ori %35, %37 : i64 %39 = arith.index_castui %38 : i64 to index %40 = arith.extui %arg19 : i32 to i64 %41 = arith.extui %arg20 : i32 to i64 %42 = arith.shli %41, %c32_i64 : i64 %43 = arith.ori %40, %42 : i64 %44 = arith.index_castui %43 : i64 to index %45 = arith.extui %arg21 : i32 to i64 %46 = arith.extui %arg22 : i32 to i64 %47 = arith.shli %46, %c32_i64 : i64 %48 = arith.ori %45, %47 : i64 %49 = arith.index_castui %48 : i64 to index %50 = arith.extui %arg23 : i32 to i64 %51 = arith.extui %arg24 : i32 to i64 %52 = arith.shli %51, %c32_i64 : i64 %53 = arith.ori %50, %52 : i64 %54 = arith.index_castui %53 : i64 to index %55 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor> %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor> %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor>{%39, %44, %49, %54} %58 = flow.dispatch.workload.ordinal %19, 0 : index %59 = flow.dispatch.workload.ordinal %24, 1 : index %60 = flow.dispatch.workload.ordinal %29, 2 : index %61 = flow.dispatch.workload.ordinal 
%34, 3 : index %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%39, %44, %49, %54} -> tensor %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %64 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %62 : tensor to tensor<2x6x11x13xf32> %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13_i32 = arith.constant 13 : i32 %c11_i32 = arith.constant 11 : i32 %c6_i32 = arith.constant 6 : i32 %c2_i32 = arith.constant 2 : i32 %c32_i64 = arith.constant 32 : i64 %c0_i32 = arith.constant 0 : i32 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) 
await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c2_i32, %c0_i32, %c6_i32, %c0_i32, %c11_i32, %c0_i32, %c13_i32, %c0_i32, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } }
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // (unchanged; verbatim duplicate of the After-ApplyPatterns dump above, elided)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // (unchanged; the only global, the immutable @__device_0, offers nothing to fold)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // (unchanged; verbatim duplicate elided)
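// [Editor's note: a minimal sketch of the 32-bit push-constant ABI these dumps keep repeating, assuming only the upstream arith/func dialects; the function name is hypothetical. On the host side (@main) each dynamic index is widened to i64 and split into two i32 operands: arith.trunci yields the low word and arith.shrui by 32 followed by trunci yields the high word (e.g. %10 through %13 above). Inside the dispatch the pair is rebuilt with extui/shli/ori (e.g. %0 through %4 of the dispatch function). The full round trip, parseable with `mlir-opt`:]
func.func @split_and_rejoin(%dim: index) -> index {
  %c32 = arith.constant 32 : i64
  // Host side: split one index into lo/hi i32 words.
  %full = arith.index_castui %dim : index to i64
  %lo = arith.trunci %full : i64 to i32
  %hi_shift = arith.shrui %full, %c32 : i64
  %hi = arith.trunci %hi_shift : i64 to i32
  // Device side: reassemble the original 64-bit value from the two words.
  %lo_wide = arith.extui %lo : i32 to i64
  %hi_wide = arith.extui %hi : i32 to i64
  %hi_pos = arith.shli %hi_wide, %c32 : i64
  %joined = arith.ori %lo_wide, %hi_pos : i64
  %out = arith.index_castui %joined : i64 to index
  return %out : index
}
// [End of editor's sketch.]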
!hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c2_i32, %c0_i32, %c6_i32, %c0_i32, %c11_i32, %c0_i32, %c13_i32, %c0_i32, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public 
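A note on the dispatch ABI visible above: every index operand crosses into the executable as a pair of i32 push constants. The host splits each 64-bit value into low/high halves (%11/%13, %15/%17, and so on), and the dispatch function reassembles them with the extui/shli/ori chains before casting back to index. A minimal standalone sketch of that round trip (the function names are illustrative, not part of the dump; the bodies mirror the idiom and should parse with mlir-opt):

  func.func @pack_index(%dim: index) -> (i32, i32) {
    %c32 = arith.constant 32 : i64
    %v = arith.index_castui %dim : index to i64
    %lo = arith.trunci %v : i64 to i32   // low 32 bits
    %s = arith.shrui %v, %c32 : i64
    %hi = arith.trunci %s : i64 to i32   // high 32 bits
    return %lo, %hi : i32, i32
  }
  func.func @unpack_index(%lo: i32, %hi: i32) -> index {
    %c32 = arith.constant 32 : i64
    %l = arith.extui %lo : i32 to i64
    %h = arith.extui %hi : i32 to i64
    %sh = arith.shli %h, %c32 : i64
    %v = arith.ori %l, %sh : i64         // (hi << 32) | lo
    %dim = arith.index_castui %v : i64 to index
    return %dim : index
  }

The {stream.values = [...]} annotations on the index_castui results record the only values each cast is known to take; that bookkeeping is what lets FoldUniformOperandsPass further down rewrite those operands as constants.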
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (identical to the IR Dump Before FuseGlobals module above)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged: @main is the only function, so there is nothing interprocedural to optimize)
// -----// IR Dump Before FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
// (identical to the IR Dump Before FuseGlobals module above)
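Before reading the folded dump below, it helps to decode the three resource sizes that recur in every @main above. They are plain row-major byte counts at 4 bytes per f32 element:

  %c2400 = 4 * 6 * 5 * 5 * 4 = 2400   (weights, tensor<4x6x5x5xf32>)
  %c2016 = 2 * 4 * 7 * 9 * 4 = 2016   (result, tensor<2x4x7x9xf32>)
  %7     = d0 * d1 * d2 * d3 * 4      (dynamic input; the arith.muli chain over the four hal.buffer_view.dim values)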
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %c0_i32 = arith.constant 0 : i32
        %c2_i32 = arith.constant 2 : i32
        %c6_i32 = arith.constant 6 : i32
        %c11_i32 = arith.constant 11 : i32
        %c13_i32 = arith.constant 13 : i32
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %0 = arith.extui %c0_i32 : i32 to i64
        %1 = arith.extui %c0_i32 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
        %5 = arith.extui %c0_i32 : i32 to i64
        %6 = arith.extui %c0_i32 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
        %10 = arith.extui %c0_i32 : i32 to i64
        %11 = arith.extui %c0_i32 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
        %15 = arith.extui %c2_i32 : i32 to i64
        %16 = arith.extui %c0_i32 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index
        %20 = arith.extui %c6_i32 : i32 to i64
        %21 = arith.extui %c0_i32 : i32 to i64
        %22 = arith.shli %21, %c32_i64 : i64
        %23 = arith.ori %20, %22 : i64
        %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index
        %25 = arith.extui %c11_i32 : i32 to i64
        %26 = arith.extui %c0_i32 : i32 to i64
        %27 = arith.shli %26, %c32_i64 : i64
        %28 = arith.ori %25, %27 : i64
        %29 = arith.index_castui %28 {stream.values = [11 : index]} : i64 to index
        %30 = arith.extui %c13_i32 : i32 to i64
        %31 = arith.extui %c0_i32 : i32 to i64
        %32 = arith.shli %31, %c32_i64 : i64
        %33 = arith.ori %30, %32 : i64
        %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index
        %35 = arith.extui %arg3 : i32 to i64
        %36 = arith.extui %arg4 : i32 to i64
        %37 = arith.shli %36, %c32_i64 : i64
        %38 = arith.ori %35, %37 : i64
        %39 = arith.index_castui %38 : i64 to index
        %40 = arith.extui %arg5 : i32 to i64
        %41 = arith.extui %arg6 : i32 to i64
        %42 = arith.shli %41, %c32_i64 : i64
        %43 = arith.ori %40, %42 : i64
        %44 = arith.index_castui %43 : i64 to index
        %45 = arith.extui %arg7 : i32 to i64
        %46 = arith.extui %arg8 : i32 to i64
        %47 = arith.shli %46, %c32_i64 : i64
        %48 = arith.ori %45, %47 : i64
        %49 = arith.index_castui %48 : i64 to index
        %50 = arith.extui %arg9 : i32 to i64
        %51 = arith.extui %arg10 : i32 to i64
        %52 = arith.shli %51, %c32_i64 : i64
        %53 = arith.ori %50, %52 : i64
        %54 = arith.index_castui %53 : i64 to index
        %55 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%39, %44, %49, %54}
        %58 = flow.dispatch.workload.ordinal %19, 0 : index
        %59 = flow.dispatch.workload.ordinal %24, 1 : index
        %60 = flow.dispatch.workload.ordinal %29, 2 : index
        %61 = flow.dispatch.workload.ordinal %34, 3 : index
        %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%39, %44, %49, %54} -> tensor<?x?x?x?xf32>
        %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %64 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %62 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13_i32 = arith.constant 13 : i32
    %c11_i32 = arith.constant 11 : i32
    %c6_i32 = arith.constant 6 : i32
    %c2_i32 = arith.constant 2 : i32
    %c32_i64 = arith.constant 32 : i64
    %c0_i32 = arith.constant 0 : i32
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %12 = arith.shrui %10, %c32_i64 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %16 = arith.shrui %14, %c32_i64 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %20 = arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}
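This fold is the whole point of the pass: at the only dispatch site, the first fourteen operands were always the same i32 constants (0, 0, 0, 0, 0, 0, 2, 0, 6, 0, 11, 0, 13, 0: the binding offsets plus the lo/hi halves of the workload dims), so they were materialized inside the executable and the push-constant list shrank from 22 entries to the 8 genuinely dynamic ones. A schematic equivalent in the func dialect (hypothetical names; the real pass rewrites stream.cmd.dispatch sites, not calls):

  // Every site passes the same constant for %uniform...
  func.func private @callee(%uniform: i32, %dynamic: i32) -> i32 {
    %0 = arith.addi %uniform, %dynamic : i32
    return %0 : i32
  }
  // ...so the operand is dropped and the constant materialized in the callee.
  func.func private @callee_folded(%dynamic: i32) -> i32 {
    %c2_i32 = arith.constant 2 : i32   // the formerly uniform operand
    %0 = arith.addi %c2_i32, %dynamic : i32
    return %0 : i32
  }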
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13_i32 = arith.constant 13 : i32
  %c11_i32 = arith.constant 11 : i32
  %c6_i32 = arith.constant 6 : i32
  %c2_i32 = arith.constant 2 : i32
  %c32_i64 = arith.constant 32 : i64
  %c0_i32 = arith.constant 0 : i32
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
  %10 = arith.index_castui %0 : index to i64
  %11 = arith.trunci %10 : i64 to i32
  %12 = arith.shrui %10, %c32_i64 : i64
  %13 = arith.trunci %12 : i64 to i32
  %14 = arith.index_castui %1 : index to i64
  %15 = arith.trunci %14 : i64 to i32
  %16 = arith.shrui %14, %c32_i64 : i64
  %17 = arith.trunci %16 : i64 to i32
  %18 = arith.index_castui %2 : index to i64
  %19 = arith.trunci %18 : i64 to i32
  %20 = arith.shrui %18, %c32_i64 : i64
  %21 = arith.trunci %20 : i64 to i32
  %22 = arith.index_castui %3 : index to i64
  %23 = arith.trunci %22 : i64 to i32
  %24 = arith.shrui %22, %c32_i64 : i64
  %25 = arith.trunci %24 : i64 to i32
  %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
    stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
      ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
      ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
      wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
    }
  } => !stream.timepoint
  %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
  %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %28 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c32_i64 = arith.constant 32 : i64
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
  %10 = arith.index_castui %0 : index to i64
  %11 = arith.trunci %10 : i64 to i32
  %12 = arith.shrui %10, %c32_i64 : i64
  %13 = arith.trunci %12 : i64 to i32
  %14 = arith.index_castui %1 : index to i64
  %15 = arith.trunci %14 : i64 to i32
  %16 = arith.shrui %14, %c32_i64 : i64
  %17 = arith.trunci %16 : i64 to i32
  %18 = arith.index_castui %2 : index to i64
  %19 = arith.trunci %18 : i64 to i32
  %20 = arith.shrui %18, %c32_i64 : i64
  %21 = arith.trunci %20 : i64 to i32
  %22 = arith.index_castui %3 : index to i64
  %23 = arith.trunci %22 : i64 to i32
  %24 = arith.shrui %22, %c32_i64 : i64
  %25 = arith.trunci %24 : i64 to i32
  %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
    stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
      ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
      ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
      wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
    }
  } => !stream.timepoint
  %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
  %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %28 : !hal.buffer_view
}
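The canonicalizer's only visible work at this point is dead-code elimination: once FoldUniformOperandsPass dropped the fourteen constant operands from the dispatch, the i32 constants that fed them (%c0_i32, %c2_i32, %c6_i32, %c11_i32, %c13_i32) lost their last uses, and the dump above no longer contains them. The same effect in miniature (illustrative only):

  func.func @dce_example(%x: i32) -> i32 {
    %dead = arith.constant 13 : i32   // no uses: canonicalize erases it
    %0 = arith.addi %x, %x : i32
    return %0 : i32
  }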
// -----// IR Dump Before CSE (cse) //----- //
// (identical to the IR Dump After Canonicalizer function above)
// -----// IR Dump After CSE (cse) //----- //
// (function unchanged: no redundant subexpressions remain for CSE to merge)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (identical to the IR Dump After Canonicalizer function above)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged: @main performs no global loads or stores to simplify)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local =
#hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %c0_i32 = arith.constant 0 : i32 %c2_i32 = arith.constant 2 : i32 %c6_i32 = arith.constant 6 : i32 %c11_i32 = arith.constant 11 : i32 %c13_i32 = arith.constant 13 : i32 %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %0 = arith.extui %c0_i32 : i32 to i64 %1 = arith.extui %c0_i32 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index %5 = arith.extui %c0_i32 : i32 to i64 %6 = arith.extui %c0_i32 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index %10 = arith.extui %c0_i32 : i32 to i64 %11 = arith.extui %c0_i32 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index %15 = arith.extui %c2_i32 : i32 to i64 %16 = arith.extui %c0_i32 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index %20 = arith.extui %c6_i32 : i32 to i64 %21 = arith.extui %c0_i32 : i32 to i64 %22 = arith.shli %21, %c32_i64 : i64 %23 = arith.ori %20, %22 : i64 %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index %25 = arith.extui %c11_i32 : i32 to i64 %26 = arith.extui %c0_i32 : i32 to i64 %27 = arith.shli %26, %c32_i64 : i64 %28 = arith.ori %25, %27 : i64 %29 = arith.index_castui %28 {stream.values = [11 : index]} : i64 to index %30 = arith.extui %c13_i32 : i32 to i64 %31 = arith.extui %c0_i32 : i32 to i64 %32 = arith.shli %31, %c32_i64 : i64 %33 = arith.ori %30, %32 : i64 %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index %35 = arith.extui %arg3 : i32 to i64 %36 = arith.extui %arg4 : i32 to i64 %37 = arith.shli %36, %c32_i64 : i64 %38 = arith.ori %35, %37 : i64 %39 = arith.index_castui %38 : i64 to index %40 = arith.extui %arg5 : i32 to i64 %41 = arith.extui %arg6 : i32 to i64 %42 = arith.shli %41, %c32_i64 : i64 %43 = arith.ori %40, %42 : i64 %44 = arith.index_castui %43 : i64 to index %45 = arith.extui %arg7 : i32 to i64 %46 = arith.extui %arg8 : i32 to i64 %47 = arith.shli %46, %c32_i64 : i64 %48 = arith.ori %45, %47 : i64 %49 = arith.index_castui %48 : i64 to index %50 = arith.extui %arg9 : i32 to i64 %51 = arith.extui %arg10 : i32 to i64 %52 = arith.shli %51, %c32_i64 : i64 %53 = arith.ori %50, %52 : i64 %54 = arith.index_castui %53 : i64 to index %55 = stream.binding.subspan %arg1[%9] : 
!stream.binding -> !flow.dispatch.tensor> %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor> %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor>{%39, %44, %49, %54} %58 = flow.dispatch.workload.ordinal %19, 0 : index %59 = flow.dispatch.workload.ordinal %24, 1 : index %60 = flow.dispatch.workload.ordinal %29, 2 : index %61 = flow.dispatch.workload.ordinal %34, 3 : index %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%39, %44, %49, %54} -> tensor %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %64 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %62 : tensor to tensor<2x6x11x13xf32> %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci 
%20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 
1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 
for %7] : !stream.resource<external>{%7},
      ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
      wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
    }
  } => !stream.timepoint
  %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
  %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %28 : !hal.buffer_view
  }
}
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %c0 = arith.constant 0 : index
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 : i64 to index
        %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19}
        %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32>
        %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %25 = tensor.empty() : tensor<2x4x7x9xf32>
        %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
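        // Reader note: the eight i32 operands handed to this dispatch are the
        // lo/hi 32-bit halves of the four dynamic dims of %input0. The host
        // side in @main produces them as lo = trunci(index_castui(dim)) and
        // hi = trunci(shrui(index_castui(dim), 32)); the extui/shli/ori/
        // index_castui chain at the top of this function inverts that, i.e.
        // dim = index_castui((hi << 32) | lo). Worked example, assuming the
        // batch dim is 2: lo = 2, hi = 0, so (0 << 32) | 2 == 2 as expected.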
flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", 
"embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = 
arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public 
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : 
!hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: 
i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : 
!hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 
: i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = 
arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %c0 = arith.constant 0 : index
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 : i64 to index
        %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19}
        %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32>
        %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %25 = tensor.empty() : tensor<2x4x7x9xf32>
        %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c32_i64 = arith.constant 32 : i64
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %12 = arith.shrui %10, %c32_i64 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %16 = arith.shrui %14, %c32_i64 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %20 = arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}
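// NOTE (editorial sketch): each dynamic dimension of %input0 crosses the
// host/device boundary as a pair of i32 push constants. The host-side @main
// splits the 64-bit index into low/high halves (index_castui + trunci +
// shrui), and the dispatch function recombines them (extui + shli + ori +
// index_castui). A minimal round-trip check of that packing, in plain Python
// (illustrative only; the function names are not part of the IR):
//
//   def split_index(dim):
//       v = dim & ((1 << 64) - 1)        # index -> i64 (arith.index_castui)
//       return v & 0xFFFFFFFF, v >> 32   # lo (trunci), hi (shrui + trunci)
//
//   def join_halves(lo, hi):
//       return (hi << 32) | lo           # extui + shli + ori
//
//   for dim in (2, 6, 11, 13):
//       assert join_halves(*split_index(dim)) == dim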
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
// (module identical to the IR Dump After IPO above)
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
// (module unchanged)
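// NOTE (editorial sketch): the static sizes in @main follow directly from the
// conv shapes: the 4x6x5x5 f32 filter is 4*6*5*5*4 = 2400 bytes (%c2400), the
// 2x4x7x9 f32 result is 2*4*7*9*4 = 2016 bytes (%c2016), and with stride 1,
// dilation 1 and no padding the output spatial dims obey out = in - k + 1,
// i.e. 7 = 11 - 5 + 1 and 9 = 13 - 5 + 1. Quick check (illustrative Python):
//
//   from math import prod
//   assert prod([4, 6, 5, 5]) * 4 == 2400   # filter bytes (%c2400)
//   assert prod([2, 4, 7, 9]) * 4 == 2016   # result bytes (%c2016)
//   assert 11 - 5 + 1 == 7 and 13 - 5 + 1 == 9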
// -----// IR Dump Before AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
// (module unchanged)
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
// (module unchanged)
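// NOTE (editorial sketch): hal.buffer_view.assert only pins down what the ABI
// declares: for %input0 the rank, f32 element type, and dense-row-major
// encoding (the four dims are read back via hal.buffer_view.dim), and for
// %input1 the full static 4x6x5x5 shape. The equivalent host-side check,
// sketched in Python (illustrative only; not the runtime API):
//
//   def assert_shape(actual, expected):   # expected: int or None per dim
//       assert len(actual) == len(expected), "rank mismatch"
//       for got, want in zip(actual, expected):
//           assert want is None or got == want, "shape mismatch"
//
//   assert_shape((2, 6, 11, 13), (None, None, None, None))  # input0: dynamic
//   assert_shape((4, 6, 5, 5), (4, 6, 5, 5))                # input1: static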
i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan 
%arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch 
@main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = 
linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> 
!hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func 
@main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", 
[#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %c0 = arith.constant 0 : index
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 : i64 to index
        %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19}
        %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32>
        %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %25 = tensor.empty() : tensor<2x4x7x9xf32>
        %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c32_i64 = arith.constant 32 : i64
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %12 = arith.shrui %10, %c32_i64 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %16 = arith.shrui %14, %c32_i64 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %20 = arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}

// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
// (module unchanged by the pass; dump identical to the IR shown above)
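A note on the dump above: the host-side @main splits every dynamic dimension of %input0 into a pair of i32 push constants (arith.trunci for the low half, arith.shrui plus arith.trunci for the high half), and the dispatch function reassembles each index with arith.extui / arith.shli / arith.ori before binding the dynamically shaped operand. A minimal C sketch of the same round trip, assuming 64-bit index values; split_index and join_index are illustrative names, not IREE APIs:

#include <assert.h>
#include <stdint.h>

/* Host side: split a 64-bit index into two i32 push constants
   (lo = trunci(x), hi = trunci(shrui(x, 32))). */
static void split_index(uint64_t dim, uint32_t *lo, uint32_t *hi) {
  *lo = (uint32_t)dim;
  *hi = (uint32_t)(dim >> 32);
}

/* Device side: rebuild the index (extui, shli 32, ori). */
static uint64_t join_index(uint32_t lo, uint32_t hi) {
  return (uint64_t)lo | ((uint64_t)hi << 32);
}

int main(void) {
  uint32_t lo, hi;
  split_index(11, &lo, &hi); /* e.g. the dynamic H dimension */
  assert(join_index(lo, hi) == 11);
  return 0;
}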
// -----// IR Dump Before VerifyDevicesPass (iree-hal-verify-devices) //----- //
// (module unchanged; dump identical to the IR shown above)
i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index 
to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = 
arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = 
stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> 
tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => 
%result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> 
!hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = 
#hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module 
{ func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : 
!hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = 
arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 
%13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> 
!flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: 
!stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}

// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c32_i64 = arith.constant 32 : i64
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
  %10 = arith.index_castui %0 : index to i64
  %11 = arith.trunci %10 : i64 to i32
  %12 = arith.shrui %10, %c32_i64 : i64
  %13 = arith.trunci %12 : i64 to i32
  %14 = arith.index_castui %1 : index to i64
  %15 = arith.trunci %14 : i64 to i32
  %16 = arith.shrui %14, %c32_i64 : i64
  %17 = arith.trunci %16 : i64 to i32
  %18 = arith.index_castui %2 : index to i64
  %19 = arith.trunci %18 : i64 to i32
  %20 = arith.shrui %18, %c32_i64 : i64
  %21 = arith.trunci %20 : i64 to i32
  %22 = arith.index_castui %3 : index to i64
  %23 = arith.trunci %22 : i64 to i32
  %24 = arith.shrui %22, %c32_i64 : i64
  %25 = arith.trunci %24 : i64 to i32
  %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
    stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
      ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
      ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
      wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
    }
  } => !stream.timepoint
  %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
  %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %28 : !hal.buffer_view
}

// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function identical to the "IR Dump Before SimplifyGlobalAccesses" above; the pass made no changes)
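
The eight i32 operands fed to the dispatch above are the four dynamic input dimensions, each widened to i64 (arith.index_castui) and split into low/high 32-bit words (the arith.trunci / arith.shrui pairs at %10..%25); the dispatch function reverses this with arith.extui / arith.shli / arith.ori. A minimal C sketch of that round trip, with illustrative function names that are not IREE APIs:

#include <assert.h>
#include <stdint.h>

/* Host side: mirrors arith.index_castui + arith.trunci / arith.shrui. */
static void pack_index(uint64_t dim, uint32_t *lo, uint32_t *hi) {
  *lo = (uint32_t)dim;         /* arith.trunci : i64 to i32            */
  *hi = (uint32_t)(dim >> 32); /* arith.shrui by 32, then arith.trunci */
}

/* Device side: mirrors arith.extui / arith.shli / arith.ori. */
static uint64_t unpack_index(uint32_t lo, uint32_t hi) {
  return (uint64_t)lo | ((uint64_t)hi << 32);
}

int main(void) {
  /* The dims asserted in this trace; any 64-bit value round-trips. */
  uint64_t dims[4] = {2, 6, 11, 13};
  for (int i = 0; i < 4; ++i) {
    uint32_t lo, hi;
    pack_index(dims[i], &lo, &hi);
    assert(unpack_index(lo, hi) == dims[i]);
  }
  return 0;
}
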
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %c0 = arith.constant 0 : index
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 : i64 to index
        %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19}
        %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32>
        %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %25 = tensor.empty() : tensor<2x4x7x9xf32>
        %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c32_i64 = arith.constant 32 : i64
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %12 = arith.shrui %10, %c32_i64 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %16 = arith.shrui %14, %c32_i64 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %20 = arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}

// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (module identical to the "IR Dump Before ApplyPatterns" above; the pass made no changes)
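
For reference, the buffer sizes threaded through these dumps are plain row-major byte counts: the arith.muli chain (%4..%7) computes d0 * 4 * d1 * d2 * d3, where 4 is sizeof(f32), and the static output shape follows from the unit-stride, unit-dilation convolution rule out = in - kernel + 1 (11 - 5 + 1 = 7 and 13 - 5 + 1 = 9, hence tensor<2x4x7x9xf32>). A quick check of the constants that appear in the IR, assuming the asserted 2x6x11x13 input:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the arith.muli chain %4..%7: bytes = (((d0 * 4) * d1) * d2) * d3. */
static uint64_t f32_bytes(uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3) {
  return d0 * 4 * d1 * d2 * d3;
}

int main(void) {
  printf("input   %%7     = %llu\n", (unsigned long long)f32_bytes(2, 6, 11, 13)); /* 6864 */
  printf("weights %%c2400 = %llu\n", (unsigned long long)f32_bytes(4, 6, 5, 5));   /* 2400 */
  printf("output  %%c2016 = %llu\n", (unsigned long long)f32_bytes(2, 4, 7, 9));   /* 2016 */
  return 0;
}
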
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module identical to the "IR Dump Before ApplyPatterns" above)

// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module identical to the "IR Dump Before ApplyPatterns" above; the pass made no changes)
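
The linalg.conv_2d_nchw_fchw op that every one of these dispatches carries has, with unit strides and dilations, the semantics of the following plain C loop nest (a reference sketch of the op's definition, not IREE's generated code; the zero-initialized accumulator corresponds to the linalg.fill with %cst = 0.0):

enum { N = 2, C = 6, H = 11, W = 13, F = 4, KH = 5, KW = 5, OH = H - KH + 1, OW = W - KW + 1 };

/* out[n][f][oh][ow] = sum over c, kh, kw of in[n][c][oh+kh][ow+kw] * w[f][c][kh][kw] */
static void conv_2d_nchw_fchw(const float in[N][C][H][W],
                              const float w[F][C][KH][KW],
                              float out[N][F][OH][OW]) {
  for (int n = 0; n < N; ++n)
    for (int f = 0; f < F; ++f)
      for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow) {
          float acc = 0.0f; /* linalg.fill */
          for (int c = 0; c < C; ++c)
            for (int kh = 0; kh < KH; ++kh)
              for (int kw = 0; kw < KW; ++kw)
                acc += in[n][c][oh + kh][ow + kw] * w[f][c][kh][kw];
          out[n][f][oh][ow] = acc;
        }
}
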
!stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 
%17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = 
arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load 
%20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for 
%c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before VerifyDevicesPass (iree-hal-verify-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : 
tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 
6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: 
index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) 
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  hal.executable private @main_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
      hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() {
          %c0 = arith.constant 0 : index
          %c32_i64 = arith.constant 32 : i64
          %cst = arith.constant 0.000000e+00 : f32
          %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
          %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
          %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
          %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
          %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
          %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
          %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
          %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
          %8 = arith.extui %0 : i32 to i64
          %9 = arith.extui %1 : i32 to i64
          %10 = arith.shli %9, %c32_i64 : i64
          %11 = arith.ori %8, %10 : i64
          %12 = arith.index_castui %11 : i64 to index
          %13 = arith.extui %2 : i32 to i64
          %14 = arith.extui %3 : i32 to i64
          %15 = arith.shli %14, %c32_i64 : i64
          %16 = arith.ori %13, %15 : i64
          %17 = arith.index_castui %16 : i64 to index
          %18 = arith.extui %4 : i32 to i64
          %19 = arith.extui %5 : i32 to i64
          %20 = arith.shli %19, %c32_i64 : i64
          %21 = arith.ori %18, %20 : i64
          %22 = arith.index_castui %21 : i64 to index
          %23 = arith.extui %6 : i32 to i64
          %24 = arith.extui %7 : i32 to i64
          %25 = arith.shli %24, %c32_i64 : i64
          %26 = arith.ori %23, %25 : i64
          %27 = arith.index_castui %26 : i64 to index
          %28 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
          %29 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
          %30 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
          %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32>
          %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
          %33 = tensor.empty() : tensor<2x4x7x9xf32>
          %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
          %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
          flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
          return
        }
      }
    }
  }
  // (util.func public @main is as in the preceding dump, except that
  //  stream.cmd.dispatch now routes through the variant:
  //  @main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32)
}
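// Summary of the materialized interface (a reading of the dump above, not
// compiler output): the pipeline layout holds 8 i32 push constants (4 dims x
// 2 halves), and the three dispatch operands map to set 0 as
//   ro %arg2 (dynamic NCHW input)          -> binding(0), "ReadOnly|Indirect"
//   ro %arg3 (4x6x5x5 filter, 2400 bytes)  -> binding(1), "ReadOnly|Indirect"
//   wo %arg4 (2x4x7x9 result, 2016 bytes)  -> binding(2), Indirect
// The static byte sizes check out: 4*6*5*5*4 = 2400 and 2*4*7*9*4 = 2016, and
// the 7x9 output follows from the convolution arithmetic (stride 1, dilation
// 1, no padding): OH = 11 - 5 + 1 = 7, OW = 13 - 5 + 1 = 9.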
// -----// IR Dump Before PruneExecutablesPass (iree-hal-prune-executables) //----- //
// (unchanged from the preceding dump)
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
// (unchanged: the one executable and its export are both referenced by @main, so nothing is pruned)
// -----// IR Dump Before ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
// (the @main_dispatch_0 executable from the preceding dump, unchanged, printed
//  standalone with its target and pipeline layout inlined rather than as
//  attribute aliases)
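// From here the pipeline descends into the executable for per-target codegen.
// Dumps of this shape come from the generic MLIR IR-printing options; a sketch
// of a command that would reproduce them (flag spellings can vary across IREE
// versions):
//   iree-compile conv.mlir -o conv.vmfb \
//     --iree-hal-target-backends=llvm-cpu \
//     --mlir-print-ir-before-all --mlir-print-ir-after-all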
target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = 
Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } } // -----// IR Dump Before ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = 
arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } // -----// IR Dump Before TypePropagationPass (iree-codegen-type-propagation) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : 
i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump Before BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) 
ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump Before BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, 
<1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 
32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> 
return } // -----// IR Dump Before DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, 
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
// (No softmax in this dispatch; IR unchanged.)
// -----// IR Dump Before MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
// (IR unchanged; this dump and the next are printed at module scope.)
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
// (No user-provided configuration to materialize; IR unchanged.)
// -----// IR Dump Before RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- //
// (IR unchanged.)
// -----// IR Dump After RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- //
// (Nothing to rematerialize; IR unchanged.)
// -----// IR Dump Before ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- //
// (IR unchanged.)
// -----// IR Dump After ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- //
// (All computation is already f32; IR unchanged.)
// -----// IR Dump Before CPUMaterializeDeviceEncodingPass (iree-codegen-cpu-materialize-device-encoding) //----- //
// (IR unchanged.)
// -----// IR Dump After CPUMaterializeDeviceEncodingPass (iree-codegen-cpu-materialize-device-encoding) //----- //
// (No tensor encodings present; IR unchanged.)
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
// (IR unchanged.)
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, 
// -----// IR Dump Before LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
// (IR unchanged from the dump above, printed at module scope.)
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
module {
  func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<…>} {
    // (Body unchanged from the full dump above except for the conv op, which
    // now carries the selected lowering configuration:)
    ...
    %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<…>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    ...
    return
  }
}
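// The 2x4x7x9 in the dispatch name is plain convolution arithmetic: with unit
// stride and dilation and no padding, each spatial extent is in - kernel + 1,
// so the 11x13 input under a 5x5 filter yields 7x9, with N = 2 and F = 4
// carried through. A quick Python sketch of that arithmetic (illustrative
// only, not part of the compiler):
//
//   def conv_out_size(in_size: int, kernel: int,
//                     stride: int = 1, dilation: int = 1) -> int:
//       # Output extent of an unpadded ("valid") convolution dimension.
//       effective_kernel = dilation * (kernel - 1) + 1
//       return (in_size - effective_kernel) // stride + 1
//
//   # input tensor<2x6x11x13xf32>, filter tensor<4x6x5x5xf32>:
//   assert conv_out_size(11, 5) == 7 and conv_out_size(13, 5) == 9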
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
  hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
  ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
    hal.return %x, %y, %z : index, index, index
  }
  builtin.module {
    // (Same @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 function as
    // in the dump above.)
  }
}
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } // -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, 
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } } // -----// IR Dump Before TranslateExecutablesPass (iree-hal-translate-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, 
%arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, 
strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } } // -----// IR Dump Before TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- // hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan 
layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } // -----// IR Dump Before LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = 
arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } // -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = 
Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } // -----// IR Dump Before LLVMCPULowerExecutableTargetPass (iree-llvmcpu-lower-executable-target) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump Before TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = 
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c13 = arith.constant 13 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = 
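// Note: each dynamic dimension of the binding-0 input reaches the dispatch as a
// pair of 32-bit push constants (ordinals 0-7 above), and the extui/shli/ori/
// index_castui chains reassemble each pair into one 64-bit index (%12, %17, %22,
// %27). A minimal C sketch of that reassembly -- the helper name is illustrative,
// not part of the dump:
//
//   #include <stdint.h>
//
//   // Rebuild one 64-bit size from its low/high 32-bit push-constant halves,
//   // mirroring the arith.extui + arith.shli + arith.ori sequence above.
//   static inline uint64_t rebuild_size(uint32_t lo, uint32_t hi) {
//     return (uint64_t)lo | ((uint64_t)hi << 32);
//   }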
// -----// IR Dump After TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- //
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 1, 9, 0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #translation} {
  %c13 = arith.constant 13 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32>
      %32 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %33 = tensor.empty() : tensor<2x1x1x9xf32>
      %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %cast = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32>
      flow.dispatch.tensor.store %cast, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before CSE (cse) //----- //
// (func.func unchanged from the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (func.func unchanged; no duplicated subexpressions for CSE to remove)
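// Note: TileAndDistributeToWorkgroups above split the 2x4x7x9 output into 2x1x1x9
// tiles and distributed the F (4) and OH (7) loops across workgroups: each scf.for
// runs from the workgroup id to the loop bound with the workgroup count as the
// step, a grid-stride pattern. A rough C sketch of the resulting control flow --
// helper names and signatures are illustrative only:
//
//   void compute_conv_tile(int f, int oh);  // computes one 2x1x1x9 output tile
//
//   // Each workgroup strides over the distributed output-channel/output-row loops.
//   void dispatch_workgroup(int wg_id_x, int wg_count_x, int wg_id_y, int wg_count_y) {
//     for (int f = wg_id_y; f < 4; f += wg_count_y)        /* dim 1: output channels */
//       for (int oh = wg_id_x; oh < 7; oh += wg_count_x)   /* dim 2: output rows */
//         compute_conv_tile(f, oh);
//   }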
// -----// IR Dump Before ConvertToDestinationPassingStylePass (iree-codegen-convert-to-destination-passing-style) //----- //
// (func.func unchanged from the previous dump)
// -----// IR Dump After ConvertToDestinationPassingStylePass (iree-codegen-convert-to-destination-passing-style) //----- //
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 1, 9, 0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #translation} {
  %c13 = arith.constant 13 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x4x7x9xf32>> -> tensor<2x1x1x?xf32>
      %cast = tensor.cast %31 : tensor<2x1x1x?xf32> to tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = linalg.fill ins(%cst : f32) outs(%cast : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %cast_0 = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32>
      flow.dispatch.tensor.store %cast_0, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x4x7x9xf32>>
    }
  }
  return
}
layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x?xf32> %cast = tensor.cast %31 : tensor<2x1x1x?xf32> to tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%cast : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %cast_0 = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32> flow.dispatch.tensor.store %cast_0, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After FoldAffineMinInDistributedLoopsPass (iree-codegen-fold-affinemin-in-distributed-loops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c13 = arith.constant 13 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) 
ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x?xf32> %cast = tensor.cast %31 : tensor<2x1x1x?xf32> to tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%cast : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %cast_0 = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32> flow.dispatch.tensor.store %cast_0, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c13 = arith.constant 13 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = 
arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x?xf32>
      %cast = tensor.cast %31 : tensor<2x1x1x?xf32> to tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = linalg.fill ins(%cst : f32) outs(%cast : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %cast_0 = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32>
      flow.dispatch.tensor.store %cast_0, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
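An aside on the `arith.extui` / `arith.shli` / `arith.ori` / `arith.index_castui` chains that open every dump above: each dynamic dimension of the input tensor crosses the HAL ABI as a pair of 32-bit constants (ordinals (0,1), (2,3), (4,5), (6,7)), which the dispatch zero-extends and recombines into one 64-bit index. A minimal Python sketch of the same reassembly, with names that are illustrative rather than taken from the log:

def decode_index(lo32: int, hi32: int) -> int:
    # zero-extend both 32-bit halves, shift the high half up, then OR them together
    return (lo32 & 0xFFFFFFFF) | ((hi32 & 0xFFFFFFFF) << 32)

assert decode_index(13, 0) == 13        # small extents fit entirely in the low half
assert decode_index(0, 1) == 1 << 32    # the high half carries bits 32..63

The four decoded values (%12, %17, %22, %27 in the IR) then supply the dynamic sizes of the binding(0) dispatch tensor.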
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before CSE (cse) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to 
i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After CSE (cse) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) 
: i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = 
arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = 
flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, 
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : 
i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = 
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // func.func 
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = 
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32>
          %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : 
f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = 
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : 
f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 
%3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill 
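// Tile-size arithmetic for the loop nest around this point: the workgroup
// loops walk the 4 output channels (%arg0) and 7 output rows (%arg1), while
// the inner scf.for loops walk the batch (%arg2: 0..2 step 1) and output
// width (%arg4: 0..9 step 3). The spatial sizes follow from valid, stride-1,
// dilation-1 convolution: OH = 11 - 5 + 1 = 7 and OW = 13 - 5 + 1 = 9; a
// width-3 output tile needs an input window of width 3 + 5 - 1 = 7, which is
// why the tensor.extract_slice above carves a 1x6x5x7 slab out of the
// 2x6x5x13 input block loaded per workgroup.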
ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = 
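// Effect of LLVMCPUTileRootAndFuseProducerConsumerPass, visible further down
// in this dump: two new scf.for loops (%arg6 and %arg8, both 0..5 step 1)
// tile the 5x5 filter window, so the root convolution now reduces a single
// (kh, kw) position per iteration:
//   ins : tensor<1x6x1x3xf32> input slice, tensor<1x6x1x1xf32> filter slice
//   outs: tensor<1x1x1x3xf32> accumulator, carried through iter_args
// The linalg.fill producing the zero-initialized accumulator stays hoisted
// outside the two new reduction loops.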
hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %39 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>) outs(%arg9 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> scf.yield %39 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = 
arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = 
flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %39 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>) outs(%arg9 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> scf.yield %39 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, 
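// LLVMCPUTileAndFusePass left the function unchanged (this dump is identical
// to the "Before" dump above), presumably because the tile-root-and-fuse step
// already materialized every tiled loop for this configuration, leaving
// nothing further to tile or fuse.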
Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %39 = 
linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>) outs(%arg9 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> scf.yield %39 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = 
hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %39 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>) outs(%arg9 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> scf.yield %39 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = 
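// Effect of DecomposeConvolutionToLowerDimOpsPass, visible below: with the
// tiled convolution's height dims now all unit (1x6x1x3 input, 1x6x1x1
// filter, 1x1x1x3 accumulator), the 2-D op is rewritten as a 1-D one over
// the width dimension:
//   before: linalg.conv_2d_nchw_fchw ins(tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>)
//                                    outs(tensor<1x1x1x3xf32>)
//   after : linalg.conv_1d_ncw_fcw   ins(tensor<1x6x3xf32>, tensor<1x6x1xf32>)
//                                    outs(tensor<1x1x3xf32>)
// Rank-reducing tensor.extract_slice / tensor.insert_slice pairs drop and
// restore the unit height dimension around the new op.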
arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> 
tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = 
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : 
f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, 
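// FuseTensorPadWithConsumerPass made no change to this dispatch (identical
// before/after dumps): the function contains no tensor.pad to fuse, since the
// convolution only ever reads valid, in-bounds windows of its input here.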
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : 
tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 
: i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : 
tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan 
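// This second ConcretizePadResultShapePass run also appears to be a no-op
// (the dump matches the "Before" dump above): with no tensor.pad results in
// the function, there are no pad shapes left to concretize.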
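The eight hal.interface.constant.load ordinals carry the four dynamic input sizes, each split into two i32 push constants; the extui/shli/ori/index_castui chains reassemble them into the index values %12, %17, %22 and %27. The same computation in Python (a sketch with an invented name, not IREE code):

def reassemble_size(lo32: int, hi32: int) -> int:
    # Mirrors arith.extui on both halves, arith.shli by %c32_i64,
    # arith.ori, then arith.index_castui to index.
    return (hi32 << 32) | (lo32 & 0xFFFFFFFF)

assert reassemble_size(11, 0) == 11          # small sizes live in the low word
assert reassemble_size(0, 1) == 1 << 32      # the high word carries bits 32..63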
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 
0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = 
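ConcretizePadResultShapePass and TensorToVectorVectorizePadPass only act on tensor.pad ops, and this dispatch contains none, so the function arrives at GenericVectorizationPass exactly as tiling produced it; the vectorizer below is the first pass in this stretch that changes the IR.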
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
// -----// IR Dump Before OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) {
            %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %extracted_slice_6 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32>
              %39 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %40 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x3xf32>, vector<1x1x3xf32>
              %42 = vector.transpose %40, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %43 = vector.extract %42[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %44 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %43, %41 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              %45 = vector.transfer_write %44, %extracted_slice_6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
              %inserted_slice_7 = tensor.insert_slice %45 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32>
              scf.yield %inserted_slice_7 : tensor<1x1x1x3xf32>
            }
            scf.yield %38 : tensor<1x1x1x3xf32>
          }
          %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
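GenericVectorizationPass rewrites each width-1 conv_1d tap into vector transfers plus a vector.contract. The contract's indexing maps read: lhs is indexed (d0, d3, d1), rhs (d3, d2), accumulator (d0, d2, d1), with d3 the only reduction dimension, i.e. a batched matvec over the 6 input channels producing the 3 output-width elements. A numpy check of those semantics (einsum letters a..d stand for d0..d3; illustrative only):

import numpy as np

lhs = np.random.rand(1, 6, 3).astype(np.float32)  # input window (n, c, w), indexed (d0, d3, d1)
rhs = np.random.rand(6, 1).astype(np.float32)     # one filter tap (c, f), indexed (d3, d2)
acc = np.random.rand(1, 1, 3).astype(np.float32)  # output tile (n, f, w), indexed (d0, d2, d1)

# out[d0, d2, d1] = acc[d0, d2, d1] + sum over d3 of lhs[d0, d3, d1] * rhs[d3, d2]
res = np.einsum('abc,bd->adc', lhs, rhs) + acc
assert res.shape == (1, 1, 3)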
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// -----// IR Dump Before CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
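OptimizeTensorInsertExtractSlicesPass hoists the loop-invariant filter load %31 out of the %arg1 loop and, more importantly, cancels the per-iteration transfer_write/transfer_read round trip on the 1x1x3 accumulator: the two reduction loops now carry a vector<1x1x3xf32> in their iter_args and touch the tensor only once before and once after the loop nest. The before/after shape of that reduction, sketched in Python (illustrative, with invented names; not IREE code):

import numpy as np

def taps():
    rng = np.random.default_rng(0)
    return [(rng.random((1, 6, 3), np.float32), rng.random((6, 1), np.float32)) for _ in range(25)]

def reduce_through_memory(pairs):
    # Before the pass: every tap re-reads the tile and writes it back.
    tile = np.zeros((1, 1, 3), np.float32)
    for lhs, rhs in pairs:
        acc = tile.copy()                              # vector.transfer_read
        acc += np.einsum('abc,bd->adc', lhs, rhs)      # vector.contract
        tile = acc                                     # vector.transfer_write
    return tile

def reduce_in_registers(pairs):
    # After the pass: read once, carry the value through the loop, write once.
    acc = np.zeros((1, 1, 3), np.float32)
    for lhs, rhs in pairs:
        acc += np.einsum('abc,bd->adc', lhs, rhs)
    return acc

assert np.allclose(reduce_through_memory(taps()), reduce_in_registers(taps()))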
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32> %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32> %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) { %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) { %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 
0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32> %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32> %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32> %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32> %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32> scf.yield %45 : vector<1x1x3xf32> } scf.yield %40 : vector<1x1x3xf32> } %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 
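Canonicalizer and CSE have little left to do at this point: the constants were already deduplicated, and each extui/shli/ori chain feeds a distinct dynamic dimension, so no folds or common subexpressions remain and the function survives both passes unchanged.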
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before LLVMCPUVerifyVectorSizeLegalityPass (iree-llvmcpu-verify-vector-size-legality) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass (iree-llvmcpu-verify-vector-size-legality) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before OptimizeVectorTransferPass (iree-codegen-optimize-vector-transfer) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After OptimizeVectorTransferPass (iree-codegen-optimize-vector-transfer) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32>
              %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32>
              %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32>
              %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %41, %43, %44 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %46 = vector.broadcast %45 : vector<3xf32> to vector<1x3xf32>
              %47 = vector.broadcast %46 : vector<1x3xf32> to vector<1x1x3xf32>
              scf.yield %47 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %38 = vector.extract %37[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32>
              %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32>
              %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32>
              %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %41, %43, %44 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %46 = vector.broadcast %45 : vector<3xf32> to vector<1x3xf32>
              %47 = vector.broadcast %46 : vector<1x3xf32> to vector<1x1x3xf32>
              scf.yield %47 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %38 = vector.extract %37[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32>
              %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32>
              %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32>
              %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %41, %43, %44 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %46 = vector.broadcast %45 : vector<3xf32> to vector<1x3xf32>
              %47 = vector.broadcast %46 : vector<1x3xf32> to vector<1x1x3xf32>
              scf.yield %47 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %38 = vector.extract %37[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) { %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) { %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32> %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32> %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32> %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32> %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, 
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
// (identical to the preceding dump: no tensor.empty remains in this dispatch for the pass to rewrite)
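// For context on why this pass is a no-op here: empty-tensor-to-alloc-tensor only
// rewrites tensor.empty ops, and the dispatch's only tensor.empty has already been
// folded away by earlier passes. Where one does survive to this point, the rewrite
// is a one-for-one swap -- a minimal sketch with a hypothetical shape, not taken
// from this dump:
//
//   // a destination tensor with undefined contents ...
//   %empty = tensor.empty() : tensor<2x1x1x9xf32>
//   // ... becomes an explicit allocation that one-shot bufferization
//   // can later lower to a memref.alloc:
//   %alloc = bufferization.alloc_tensor() : tensor<2x1x1x9xf32>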
// -----// IR Dump Before IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
// (identical to the preceding dump)
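// Note: the long arith prologue repeated at the top of every dump (%8 through %27)
// reconstitutes the four dynamic input dimensions. Each index-typed dimension
// crosses the dispatch boundary as two 32-bit push constants and is rebuilt as
// low | (high << 32). One such decode, with illustrative names in place of the
// dump's SSA numbers:
//
//   %lo    = arith.extui %w0 : i32 to i64       // zero-extend the low word
//   %hi    = arith.extui %w1 : i32 to i64       // zero-extend the high word
//   %hi64  = arith.shli %hi, %c32_i64 : i64     // move the high word into bits 32..63
//   %dim64 = arith.ori %lo, %hi64 : i64         // recombine the 64-bit extent
//   %dim   = arith.index_castui %dim64 : i64 to index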
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) { %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) { %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32> %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32> %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32> %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32> %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind} %41, %43, %44 : vector<6x3xf32>, vector<6xf32> into vector<3xf32> %46 = 
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %31 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %subview_2) -> (memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
        %32 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
          %subview_5 = memref.subview %subview_3[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_6 = memref.subview %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) {
            %35 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %subview_8 = memref.subview %subview_5[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_10 = memref.subview %subview_8[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_11 = memref.subview %subview_9[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %36 = vector.transfer_read %subview_10[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %37 = vector.transfer_read %subview_11[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %38 = vector.shape_cast %37 : vector<6x1xf32> to vector<6xf32>
              %39 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %40 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %36, %38, %39 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %41 = vector.broadcast %40 : vector<3xf32> to vector<1x3xf32>
              %42 = vector.broadcast %41 : vector<1x3xf32> to vector<1x1x3xf32>
              scf.yield %42 : vector<1x1x3xf32>
            }
            scf.yield %35 : vector<1x1x3xf32>
          }
          %34 = vector.extract %33[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %34, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_6 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
          ^bb0(%in: f32, %out: f32):
            linalg.yield %in : f32
          }
          scf.yield %arg5 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        }
        scf.yield %32 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      }
      %subview_4 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%31 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
  }
  return
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
// (identical to the preceding dump)
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
// (identical to the preceding dump: no shaped-type result dims remain to resolve)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (identical to the preceding dump)
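// Two changes to look for in the Canonicalizer dump below, relative to the
// bufferized IR above: (1) the memref iter_args threaded through the outer
// scf.for nest are folded away -- each iteration yields its block argument
// unchanged, so the loop-carried buffer value is dead -- and (2) the two-step
// vector.broadcast (3xf32 -> 1x3xf32 -> 1x1x3xf32) fuses into a single
// broadcast. A reduced sketch of the loop rewrite, with a hypothetical body:
//
//   // before: the output buffer is a loop-carried value that never changes
//   %r = scf.for %i = %c0 to %c2 step %c1 iter_args(%m = %out) -> (memref<9xf32>) {
//     vector.transfer_write %v, %m[%i] {in_bounds = [true]} : vector<1xf32>, memref<9xf32>
//     scf.yield %m : memref<9xf32>
//   }
//
//   // after: writes target the buffer directly and the loop yields nothing
//   scf.for %i = %c0 to %c2 step %c1 {
//     vector.transfer_write %v, %out[%i] {in_bounds = [true]} : vector<1xf32>, memref<9xf32>
//   }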
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      scf.for %arg2 = %c0 to %c2 step %c1 {
        scf.for %arg3 = %c0 to %c9 step %c3 {
          %subview_5 = memref.subview %subview_3[%arg2, 0, 0, %arg3] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_6 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %31 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %cst) -> (vector<1x1x3xf32>) {
            %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %arg5) -> (vector<1x1x3xf32>) {
              %subview_8 = memref.subview %subview_5[0, 0, %arg4, %arg6] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview[0, 0, %arg4, %arg6] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_10 = memref.subview %subview_8[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_11 = memref.subview %subview_9[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %34 = vector.transfer_read %subview_10[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %35 = vector.transfer_read %subview_11[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32>
              %37 = vector.extract %arg7[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32>
              scf.yield %39 : vector<1x1x3xf32>
            }
            scf.yield %33 : vector<1x1x3xf32>
          }
          %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %32, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_6 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
          ^bb0(%in: f32, %out: f32):
            linalg.yield %in : f32
          }
        }
      }
      %subview_4 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_2 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
  }
  return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan
// -----// IR Dump After CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      scf.for %arg2 = %c0 to %c2 step %c1 {
        scf.for %arg3 = %c0 to %c9 step %c3 {
          %subview_4 = memref.subview %subview_3[%arg2, 0, 0, %arg3] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_5 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %31 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %cst) -> (vector<1x1x3xf32>) {
            %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %arg5) -> (vector<1x1x3xf32>) {
              %subview_6 = memref.subview %subview_4[0, 0, %arg4, %arg6] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_7 = memref.subview %subview[0, 0, %arg4, %arg6] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %34 = vector.transfer_read %subview_8[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %35 = vector.transfer_read %subview_9[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32>
              %37 = vector.extract %arg7[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32>
              scf.yield %39 : vector<1x1x3xf32>
            }
            scf.yield %33 : vector<1x1x3xf32>
          }
          %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %32, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_5 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_5 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
          ^bb0(%in: f32, %out: f32):
            linalg.yield %in : f32
          }
        }
      }
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_2 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_2 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
  }
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      scf.for %arg2 = %c0 to %c2 step %c1 {
        scf.for %arg3 = %c0 to %c9 step %c3 {
          %subview_4 = memref.subview %subview_3[%arg2, 0, 0, %arg3] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_5 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %31 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %cst) -> (vector<1x1x3xf32>) {
            %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %arg5) -> (vector<1x1x3xf32>) {
              %subview_6 = memref.subview %subview_4[0, 0, %arg4, %arg6] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_7 = memref.subview %subview[0, 0, %arg4, %arg6] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %34 = vector.transfer_read %subview_8[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %35 = vector.transfer_read %subview_9[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32>
              %37 = vector.extract %arg7[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32>
              scf.yield %39 : vector<1x1x3xf32>
            }
            scf.yield %33 : vector<1x1x3xf32>
          }
          %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %32, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        }
      }
    }
  }
  return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      scf.for %arg2 = %c0 to %c2 step %c1 {
        scf.for %arg3 = %c0 to %c9 step %c3 {
          %subview_4 = memref.subview %subview_3[%arg2, 0, 0, %arg3] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_5 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %31 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %cst) -> (vector<1x1x3xf32>) {
            %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %arg5) -> (vector<1x1x3xf32>) {
              %subview_6 = memref.subview %subview_4[0, 0, %arg4, %arg6] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_7 = memref.subview %subview[0, 0, %arg4, %arg6] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %34 = vector.transfer_read %subview_8[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %35 = vector.transfer_read %subview_9[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32>
              %37 = vector.extract %arg7[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32>
              scf.yield %39 : vector<1x1x3xf32>
            }
            scf.yield %33 : vector<1x1x3xf32>
          }
          %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %32, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        }
      }
    }
  }
  return
}
// -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- //
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_1 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 
0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_3 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_4 = memref.subview %subview_3[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_5 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> vector.transfer_write %cst_0, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_4[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %34 = vector.transfer_read %subview_8[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type>, vector<6x3xf32> %35 = vector.transfer_read %subview_9[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type>, vector<6x1xf32> %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32> %37 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32> %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32> scf.yield %39 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> vector.transfer_write %32, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, 
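// NOTE: the rewrite above removed the two outer workgroup-distribution loops (the
// %arg0/%arg1 loops that indexed %subview_2/%subview_3 in the previous dump). With this
// tiling each workgroup provably executes exactly one iteration of them, so the pass
// replaces their induction variables with hal.interface.workgroup.id values and erases
// the loops; the inner tile loops survive and are renumbered. Minimal illustrative
// sketch of the pattern (hypothetical bounds, not taken from this dispatch):
//   %id = hal.interface.workgroup.id[0] : index
//   scf.for %i = %id to %ub step %wg_count_x {   // trip count is provably 1
//     "use"(%i) : (index) -> ()
//   }
// folds to:
//   "use"(%id) : (index) -> ()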
// -----// IR Dump Before LLVMCPUDropVectorUnitDimsPass (iree-llvmcpu-drop-vector-unit-dims) //----- //
// (elided: IR identical to the preceding dump)
// -----// IR Dump After LLVMCPUDropVectorUnitDimsPass (iree-llvmcpu-drop-vector-unit-dims) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>
          %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
          %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %35 = vector.transfer_read %subview_11[%c0], %cst_0 {in_bounds = [true]} : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>, vector<6xf32>
          %36 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %37 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %35, %36 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
          %38 = vector.broadcast %37 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %38 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
    }
  }
  return
}
iterator_types = ["parallel", "reduction"], kind = #vector.kind} %34, %35, %36 : vector<6x3xf32>, vector<6xf32> into vector<3xf32> %38 = vector.broadcast %37 : vector<3xf32> to vector<1x1x3xf32> scf.yield %38 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump Before LLVMCPUVirtualVectorLoweringPass (iree-llvmcpu-virtual-vector-lowering) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : 
memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] 
[1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.transfer_read %subview_11[%c0], %cst_0 {in_bounds = [true]} : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type>, vector<6xf32> %36 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %37 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind} %34, %35, %36 : vector<6x3xf32>, vector<6xf32> into vector<3xf32> %38 = vector.broadcast %37 : vector<3xf32> to vector<1x1x3xf32> scf.yield %38 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump After LLVMCPUVirtualVectorLoweringPass (iree-llvmcpu-virtual-vector-lowering) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 
: i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to 
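// NOTE: LLVMCPUVirtualVectorLowering unrolled the single vector.contract (which
// contracted vector<6x3xf32> with vector<6xf32> into vector<3xf32>) along its
// reduction dimension of size 6 into six outer-product steps: each step extracts one
// row of the vector<6x3xf32> operand, broadcasts the matching scalar of the other
// operand (now fetched with a plain memref.load), and accumulates with vector.fma.
// One step of the chain, as it appears in the dump above:
//   %row  = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32>
//   %s    = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
//   %b    = vector.broadcast %s : f32 to vector<3xf32>
//   %acc1 = vector.fma %row, %b, %acc0 : vector<3xf32>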
memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %36 = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32> %37 = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %38 = vector.broadcast %37 : f32 to vector<3xf32> %39 = vector.fma %36, %38, %35 : vector<3xf32> %40 = vector.extract %34[1] : vector<3xf32> from vector<6x3xf32> %41 = memref.load %subview_11[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %40, %42, %39 : vector<3xf32> %44 = vector.extract %34[2] : vector<3xf32> from vector<6x3xf32> %45 = memref.load %subview_11[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %46 = vector.broadcast %45 : f32 to vector<3xf32> %47 = vector.fma %44, %46, %43 : vector<3xf32> %48 = vector.extract %34[3] : vector<3xf32> from vector<6x3xf32> %49 = memref.load %subview_11[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %50 = vector.broadcast %49 : f32 to vector<3xf32> %51 = vector.fma %48, %50, %47 : vector<3xf32> %52 = vector.extract %34[4] : vector<3xf32> from vector<6x3xf32> %53 = memref.load %subview_11[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %52, %54, %51 : vector<3xf32> %56 = vector.extract %34[5] : vector<3xf32> from vector<6x3xf32> %57 = memref.load %subview_11[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %58 = vector.broadcast %57 : f32 to vector<3xf32> %59 = vector.fma %56, %58, %55 : vector<3xf32> %60 = vector.broadcast %59 : vector<3xf32> to vector<1x1x3xf32> scf.yield %60 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 
32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 
1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %36 = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32> %37 = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %38 = vector.broadcast %37 : f32 to vector<3xf32> %39 = vector.fma %36, %38, %35 : vector<3xf32> %40 = vector.extract %34[1] : vector<3xf32> from vector<6x3xf32> %41 = memref.load %subview_11[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %40, %42, %39 : vector<3xf32> %44 = vector.extract %34[2] : vector<3xf32> from vector<6x3xf32> %45 = memref.load %subview_11[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %46 = vector.broadcast %45 : f32 to vector<3xf32> %47 = vector.fma %44, %46, %43 : vector<3xf32> %48 = vector.extract %34[3] : vector<3xf32> from vector<6x3xf32> %49 = memref.load %subview_11[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %50 = vector.broadcast 
%49 : f32 to vector<3xf32> %51 = vector.fma %48, %50, %47 : vector<3xf32> %52 = vector.extract %34[4] : vector<3xf32> from vector<6x3xf32> %53 = memref.load %subview_11[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %52, %54, %51 : vector<3xf32> %56 = vector.extract %34[5] : vector<3xf32> from vector<6x3xf32> %57 = memref.load %subview_11[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %58 = vector.broadcast %57 : f32 to vector<3xf32> %59 = vector.fma %56, %58, %55 : vector<3xf32> %60 = vector.broadcast %59 : vector<3xf32> to vector<1x1x3xf32> scf.yield %60 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = 
arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = 
memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %36 = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32> %37 = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %38 = vector.broadcast %37 : f32 to vector<3xf32> %39 = vector.fma %36, %38, %35 : vector<3xf32> %40 = vector.extract %34[1] : vector<3xf32> from vector<6x3xf32> %41 = memref.load %subview_11[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %40, %42, %39 : vector<3xf32> %44 = vector.extract %34[2] : vector<3xf32> from vector<6x3xf32> %45 = memref.load %subview_11[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %46 = vector.broadcast %45 : f32 to vector<3xf32> %47 = vector.fma %44, %46, %43 : vector<3xf32> %48 = vector.extract %34[3] : vector<3xf32> from vector<6x3xf32> %49 = memref.load %subview_11[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %50 = vector.broadcast %49 : f32 to vector<3xf32> %51 = vector.fma %48, %50, %47 : vector<3xf32> %52 = vector.extract %34[4] : vector<3xf32> from vector<6x3xf32> %53 = memref.load %subview_11[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %52, %54, %51 : vector<3xf32> %56 = vector.extract %34[5] : vector<3xf32> from vector<6x3xf32> %57 = memref.load %subview_11[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %58 = vector.broadcast %57 : f32 to vector<3xf32> %59 = vector.fma %56, %58, %55 : vector<3xf32> %60 = vector.broadcast %59 : vector<3xf32> to vector<1x1x3xf32> scf.yield %60 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump Before LLVMCPUVectorTransferLoweringPass (iree-llvmcpu-vector-transfer-lowering) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 
0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to 
memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %36 = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32> %37 = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %38 = vector.broadcast %37 : f32 to vector<3xf32> %39 = vector.fma %36, %38, %35 : vector<3xf32> %40 = vector.extract %34[1] : vector<3xf32> from vector<6x3xf32> %41 = memref.load %subview_11[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %40, %42, %39 : vector<3xf32> %44 = vector.extract %34[2] : vector<3xf32> from vector<6x3xf32> %45 = memref.load %subview_11[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %46 = vector.broadcast %45 : f32 to vector<3xf32> %47 = vector.fma %44, %46, %43 : vector<3xf32> %48 = vector.extract %34[3] : vector<3xf32> from vector<6x3xf32> %49 = memref.load %subview_11[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %50 = vector.broadcast %49 : f32 to 
// -----// IR Dump After LLVMCPUVectorTransferLoweringPass (iree-llvmcpu-vector-transfer-lowering) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
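  // Dynamic sizes cross the HAL ABI as pairs of 32-bit push constants: %8..%27 above
  // reassemble each i64 extent as lo | (hi << 32) and cast it to index. A C sketch of
  // the same packing (illustrative only; the names are not from the dump):
  //   uint64_t extent = (uint64_t)lo | ((uint64_t)hi << 32);  /* %11 from %0, %1 */
  //   size_t   dim0   = (size_t)extent;                       /* %12             */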
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
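          // The rank-reducing subviews here collapse unit dimensions (1x6x1x3 -> 1x6x3 -> 6x3
          // for the input window, 1x6x1x1 -> 1x6x1 -> 6xf32 for the filter column) so the
          // loads below can use plain 2-D and 1-D strided indexing; note the filter map
          // (d0)[s0] -> (d0 * 25 + s0), i.e. stride 25 between the 5x5 filter planes.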
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
    }
  }
  return
}
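// After decomposition the innermost computation is a 1-D convolution microkernel:
// for each filter tap (kh, kw) it loads a 6x3 channel-by-column window row by row
// and accumulates three output columns against a broadcast filter scalar with
// fully unrolled vector.fma. Equivalent scalar sketch (illustrative only):
//   for (int c = 0; c < 6; ++c)        /* row loads %34..%39          */
//     for (int x = 0; x < 3; ++x)      /* the three vector lanes      */
//       acc[x] += in[c][x] * w[c];     /* vector.fma with broadcast w */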
// -----// IR Dump Before LLVMCPUVectorTransposeLoweringPass (iree-llvmcpu-vector-transpose-lowering) //----- //
// (function body identical to the dump above; duplicate elided)
// -----// IR Dump After LLVMCPUVectorTransposeLoweringPass (iree-llvmcpu-vector-transpose-lowering) //----- //
// (no vector.transpose ops to rewrite; IR unchanged, duplicate elided)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (identical to the dump after iree-llvmcpu-vector-transfer-lowering; duplicate elided)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (canonicalization found nothing to fold; IR unchanged, duplicate elided)
// -----// IR Dump Before LLVMCPUVectorShapeCastLoweringPass (iree-llvmcpu-vector-shape-cast-lowering) //----- //
// (identical; duplicate elided, the post-pass dump is reproduced in full below)
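// LLVMCPUVectorShapeCastLoweringPass only rewrites vector.shape_cast ops; this
// kernel carries none (only vector.broadcast/extract), so no change is expected.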
: memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
    }
  }
  return
}
// -----// IR Dump After LLVMCPUVectorShapeCastLoweringPass (iree-llvmcpu-vector-shape-cast-lowering) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
    }
  }
  return
}
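The innermost reduction above has been fully unrolled over the six input channels: six vector.load ops pull one 3-wide row strip per channel (%34 through %39), and six vector.fma ops (%43, %46, %49, %52, %55, %58) accumulate them against one broadcast filter scalar each. A minimal C sketch of that micro-kernel follows; the helper name and array shapes are hypothetical, not anything emitted by IREE:

/* Minimal C sketch of the unrolled inner kernel above (hypothetical names).
 * For one fixed (kh, kw) filter tap it accumulates six input channels into
 * a 3-wide strip of outputs, one FMA per channel, mirroring the chain of
 * vector.fma ops %43 ... %58. */
static void conv_tap_6ch_3w(const float in[6][3],  /* rows %34 .. %39   */
                            const float w[6],      /* scalars %41, %44, ... */
                            float acc[3]) {        /* %40 in, %58 out   */
  for (int c = 0; c < 6; ++c)      /* fully unrolled in the IR          */
    for (int x = 0; x < 3; ++x)    /* the three vector<3xf32> lanes     */
      acc[x] += in[c][x] * w[c];   /* vector.fma(in, broadcast(w), acc) */
}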
// -----// IR Dump After LLVMCPULowerExecutableTargetPass (iree-llvmcpu-lower-executable-target) //----- //
// (function body identical to the preceding dump; the pass made no further changes)
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
// (IR unchanged from the dump above)
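The only pass in this stretch that actually edits the IR is EraseHALDescriptorTypeFromMemRefPass, shown next: it erases the #hal.descriptor_type<storage_buffer> memory-space annotation from every memref type, so that, for example, memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> becomes plain memref<4x6x5x5xf32> for the LLVM-oriented passes that follow. The function is otherwise untouched.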
// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
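Both versions of the function open with the same preamble: the four dynamic extents of the input tensor cross the HAL ABI as eight 32-bit push constants (ordinals 0 to 7), and each pair is reassembled into one 64-bit value and then an index (%8 through %27) before being used as a memref size. A small C sketch of that reassembly, with a hypothetical helper name:

#include <stdint.h>
#include <stddef.h>

/* Sketch of the push-constant reassembly above (hypothetical helper name):
 * each dynamic extent arrives as two i32 halves and is rebuilt as
 * (zext lo) | (zext hi << 32), matching the arith.extui / arith.shli /
 * arith.ori sequences that feed %12, %17, %22 and %27. */
static size_t rebuild_extent(uint32_t lo, uint32_t hi) {
  uint64_t wide = (uint64_t)lo | ((uint64_t)hi << 32);
  return (size_t)wide;  /* arith.index_castui : i64 to index */
}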
// -----// IR Dump Before LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- //
// (same function body as the dump above, now printed inside its enclosing module { ... }; the IR is otherwise unchanged)
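Every subview in these dumps carries a strided layout. An annotation such as affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> on a memref<6x3xf32, ...> simply says that element (d0, d1) lives at linear offset d0 * s1 + s0 + d1 from the binding's base pointer, with the offset s0 and the row stride s1 supplied at runtime. A small C sketch of that addressing, with a hypothetical helper name:

#include <stddef.h>

/* Sketch of the strided addressing used by %subview_9 above (hypothetical
 * helper). Element (d0, d1) of a memref<6x3xf32> with layout
 * (d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1) sits at base[d0*s1 + s0 + d1]. */
static float load_strided_6x3(const float *base, size_t s0, size_t s1,
                              size_t d0, size_t d1) {
  return base[d0 * s1 + s0 + d1];
}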
// -----// IR Dump After LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- //
// (IR unchanged; this dispatch contains no ukernel ops to lower)
// -----// IR Dump Before LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- //
// (IR unchanged from the dump above)
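For reference, the control structure shared by all of these dumps: the workgroup ids pick one filter (%workgroup_id_y) and one output row (%workgroup_id_x), the two outer scf.for ops walk the batch of 2 and the 9 output columns in 3-wide strips, and the two inner scf.for ops reduce over the 5x5 filter window while carrying the vector<1x1x3xf32> accumulator through iter_args. A hypothetical C skeleton of that nest:

/* Hypothetical C skeleton of the loop nest in every dump above. */
static void workgroup_slice(void) {
  for (int n = 0; n < 2; ++n)              /* scf.for %arg0: batch          */
    for (int x0 = 0; x0 < 9; x0 += 3)      /* scf.for %arg1: 3-wide strips  */
      for (int kh = 0; kh < 5; ++kh)       /* scf.for %arg2: filter rows    */
        for (int kw = 0; kw < 5; ++kw) {   /* scf.for %arg4: filter cols    */
          /* six-channel, 3-wide FMA body (see the first sketch above) */
        }
}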
i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, 
vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
// -----// IR Dump After LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
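Two details of the dump above are worth decoding before the remaining passes, which leave this function untouched. First, the preamble (%0 through %27): the four dynamic dimensions of the binding(0) input memref arrive as eight 32-bit push constants, one low/high word pair per dimension, and are reassembled with arith.extui, arith.shli, and arith.ori. A minimal Python sketch of that reassembly (illustrative only; decode_dims is a hypothetical name, not an IREE API):

# Sketch of the %8..%27 preamble: each 64-bit dynamic dimension is shipped
# as two i32 push constants (low word at the even ordinal, high word at the
# odd one) and rebuilt with zero-extend, shift-left-32, and bitwise-or.
def decode_dims(push_constants):
    dims = []
    for lo, hi in zip(push_constants[0::2], push_constants[1::2]):
        dims.append(((hi & 0xFFFFFFFF) << 32) | (lo & 0xFFFFFFFF))
    return dims

# The conv shapes (output 2x4x7x9, filter 4x6x5x5) imply a 2x6x11x13 input:
assert decode_dims([2, 0, 6, 0, 11, 0, 13, 0]) == [2, 6, 11, 13]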
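Second, the loop nest itself: it computes a plain NCHW/FCHW 2-D convolution, decomposed into 1-D slices and vectorized by 3 along the output width. The following Python rendering of the same schedule may make the subview chains easier to follow (a sketch, not IREE output; the workgroup grid is written as the two outer loops, and nested lists stand in for the memrefs):

def conv2d_nchw_fchw(x, w, N=2, C=6, H=11, W=13, F=4, KH=5, KW=5):
    OH, OW = H - KH + 1, W - KW + 1           # 7 and 9
    y = [[[[0.0] * OW for _ in range(OH)] for _ in range(F)] for _ in range(N)]
    for f in range(F):                        # hal.interface.workgroup.id[1]
        for oh in range(OH):                  # hal.interface.workgroup.id[0]
            for n in range(N):                # scf.for %arg0
                for ow in range(0, OW, 3):    # scf.for %arg1, tile of 3
                    acc = [0.0, 0.0, 0.0]     # %cst, carried via iter_args
                    for kh in range(KH):      # scf.for %arg2
                        for kw in range(KW):  # scf.for %arg4
                            for c in range(C):        # unrolled loads %34..%39
                                wv = w[f][c][kh][kw]  # memref.load + broadcast
                                for j in range(3):    # one vector.fma
                                    acc[j] += x[n][c][oh + kh][ow + kw + j] * wv
                    for j in range(3):        # vector.store to the output tile
                        y[n][f][oh][ow + j] = acc[j]
    return y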
(iree-codegen-memrefcopy-to-linalg) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: 
?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load 
%subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = 
hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + 
d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before ConvertLinalgToLoopsPass (convert-linalg-to-loops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, 
storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to 
memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump After ConvertLinalgToLoopsPass (convert-linalg-to-loops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : 
vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 
13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + 
s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before ConvertBf16ArithToF32Pass (iree-convert-bf16-arith-to-f32) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : 
memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, 
vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump After ConvertBf16ArithToF32Pass (iree-convert-bf16-arith-to-f32) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load 
layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = 
memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before ConvertBf16ToUInt16BuffersPass (iree-codegen-convert-bf16-to-uint16-buffers) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 
32 : i64
  %0 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
// -----// IR Dump After ConvertBf16ToUInt16BuffersPass (iree-codegen-convert-bf16-to-uint16-buffers) //----- //
// (this dispatch uses only f32 buffers, so the pass made no changes; the dump is identical to the one above)
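// Annotation (a reading aid inferred from the IR above, not compiler output):
// %workgroup_id_y selects one of the 4 output channels and %workgroup_id_x one
// of the 7 output rows. The two outer scf.for loops step over the batch (0..2)
// and over the 9 output columns in strips of 3; the two inner scf.for loops
// walk the 5x5 filter window. At each window position, six vector<3xf32> loads
// (one per input channel) are multiplied by six broadcast filter scalars with
// vector.fma, accumulating a 3-wide strip of output pixels that vector.store
// writes back. The static sizes are consistent with a stride-1 5x5 filter:
// H_out = 11 - 5 + 1 = 7 and W_out = 13 - 5 + 1 = 9.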
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (identical to the preceding dump)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (the canonicalizer found nothing to fold; identical to the preceding dump)
// -----// IR Dump Before CSE (cse) //----- //
// (identical to the preceding dump)
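// Annotation: the eight i32 push constants loaded at the top of the function
// are the 32-bit halves of four i64 values. Each pair is reassembled as
// lo | (hi << 32) via arith.extui/arith.shli/arith.ori and cast to index,
// producing the dynamic N, C, H, W sizes %12, %17, %22, %27 of the
// ?x?x?x?xf32 input bound at binding(0).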
// -----// IR Dump After CSE (cse) //----- //
// (CSE found no redundant subexpressions; identical to the preceding dump)
// -----// IR Dump Before OneShotBufferize (one-shot-bufferize) //----- //
// (identical function body to the preceding dump, here printed wrapped in its enclosing module)
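// Annotation: one-shot bufferization has nothing left to do at this point;
// earlier phases already placed the dispatch on memref subspans, so no tensor
// values remain to be given buffers.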
// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
// (bufferization made no changes; identical to the preceding dump)
1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = 
memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } } // -----// IR Dump Before FoldTensorExtractOpPass (iree-codegen-fold-tensor-extract-op) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to 
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
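  // Per-workgroup tile: %workgroup_id_y selects the output channel, %workgroup_id_x the output row. The loops below cover the batch (2) and the output columns (9, in vector<3> steps); the 5x5 filter window is decomposed into per-tap, per-input-channel broadcast FMAs.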
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
// -----// IR Dump After FoldTensorExtractOpPass (iree-codegen-fold-tensor-extract-op) //----- //
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 
1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before ConvertComplexToStandard (convert-complex-to-standard) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant 
// -----// IR Dump After ConvertComplexToStandard (convert-complex-to-standard) //----- //
// -----// IR Dump Before PolynomialApproximationPass (iree-codegen-polynomial-approximation) //----- //
// -----// IR Dump After PolynomialApproximationPass (iree-codegen-polynomial-approximation) //----- //
// -----// IR Dump Before HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- //
// -----// IR Dump After HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- //
memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before IREEExpandStridedMetadataPass (iree-codegen-expand-strided-metadata) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = 
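// Note: ordinals 0-7 above carry the four dynamic input sizes as lo/hi i32 halves;
// the extui/shli/ori/index_castui chain (%8-%27) reassembles each pair into a
// 64-bit value and casts it to index, yielding the sizes {%12, %17, %22, %27}
// that bound the memref<?x?x?x?xf32> input at binding 0.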
// -----// IR Dump Before IREEExpandStridedMetadataPass (iree-codegen-expand-strided-metadata) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump After IREEExpandStridedMetadataPass (iree-codegen-expand-strided-metadata) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump Before CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump Before LLVMCPUCheckIRBeforeLLVMConversionPass (iree-llvmcpu-check-ir-before-llvm-conversion) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump After LLVMCPUCheckIRBeforeLLVMConversionPass (iree-llvmcpu-check-ir-before-llvm-conversion) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- //
// (function body identical to the preceding dump)
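// Note: convert-scf-to-cf lowers the scf.for nest into explicit blocks below: each
// loop becomes a header block whose induction variable (and, for the two inner
// loops, the vector<1x1x3xf32> accumulator) is carried as a block argument, with
// cf.cond_br testing the bound and cf.br branching back to the header.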
+ s0)>>, vector<3xf32> } } return } // -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : 
memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> cf.br ^bb1(%c0 : index) ^bb1(%31: index): // 2 preds: ^bb0, ^bb11 %32 = arith.cmpi slt, %31, %c2 : index cf.cond_br %32, ^bb2, ^bb12 ^bb2: // pred: ^bb1 cf.br ^bb3(%c0 : index) ^bb3(%33: index): // 2 preds: ^bb2, ^bb10 %34 = arith.cmpi slt, %33, %c9 : index cf.cond_br %34, ^bb4, ^bb11 ^bb4: // pred: ^bb3 %subview_2 = memref.subview %subview_1[%31, 0, 0, %33] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%31, 0, 0, %33] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> cf.br ^bb5(%c0, %cst : index, vector<1x1x3xf32>) ^bb5(%35: index, %36: vector<1x1x3xf32>): // 2 preds: ^bb4, ^bb9 %37 = arith.cmpi slt, %35, %c5 : index cf.cond_br %37, ^bb6, ^bb10 ^bb6: // pred: ^bb5 cf.br ^bb7(%c0, %36 : index, vector<1x1x3xf32>) ^bb7(%38: index, %39: vector<1x1x3xf32>): // 2 preds: ^bb6, ^bb8 %40 = arith.cmpi slt, %38, %c5 : index cf.cond_br %40, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %subview_4 = memref.subview %subview_2[0, 0, %35, %38] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_5 = memref.subview %subview[0, 0, %35, %38] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %41 = vector.load %subview_8[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %42 = vector.load %subview_8[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %43 = vector.load %subview_8[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %44 = vector.load %subview_8[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %45 = vector.load %subview_8[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %46 = vector.load %subview_8[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %47 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32> %48 = memref.load %subview_9[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + 
s0)>>
  %49 = vector.broadcast %48 : f32 to vector<3xf32>
  %50 = vector.fma %41, %49, %47 : vector<3xf32>
  %51 = memref.load %subview_9[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %52 = vector.broadcast %51 : f32 to vector<3xf32>
  %53 = vector.fma %42, %52, %50 : vector<3xf32>
  %54 = memref.load %subview_9[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %55 = vector.broadcast %54 : f32 to vector<3xf32>
  %56 = vector.fma %43, %55, %53 : vector<3xf32>
  %57 = memref.load %subview_9[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %58 = vector.broadcast %57 : f32 to vector<3xf32>
  %59 = vector.fma %44, %58, %56 : vector<3xf32>
  %60 = memref.load %subview_9[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %61 = vector.broadcast %60 : f32 to vector<3xf32>
  %62 = vector.fma %45, %61, %59 : vector<3xf32>
  %63 = memref.load %subview_9[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %64 = vector.broadcast %63 : f32 to vector<3xf32>
  %65 = vector.fma %46, %64, %62 : vector<3xf32>
  %66 = vector.broadcast %65 : vector<3xf32> to vector<1x1x3xf32>
  %67 = arith.addi %38, %c1 : index
  cf.br ^bb7(%67, %66 : index, vector<1x1x3xf32>)
^bb9:  // pred: ^bb7
  %68 = arith.addi %35, %c1 : index
  cf.br ^bb5(%68, %39 : index, vector<1x1x3xf32>)
^bb10:  // pred: ^bb5
  %69 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %subview_10 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
  vector.store %69, %subview_10[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
  %70 = arith.addi %33, %c3 : index
  cf.br ^bb3(%70 : index)
^bb11:  // pred: ^bb3
  %71 = arith.addi %31, %c1 : index
  cf.br ^bb1(%71 : index)
^bb12:  // pred: ^bb1
  return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (IR identical to the preceding dump)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
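  // Note: relative to the SCFToControlFlow output above, canonicalize has
  // folded the empty forwarding blocks left behind by the scf-to-cf lowering
  // (the old ^bb2 and ^bb6), passing the loop init values directly as
  // successor operands of cf.cond_br, so this function has blocks ^bb1..^bb10
  // where the previous dump had ^bb1..^bb12. A minimal sketch of that block
  // merge (hypothetical %cond; not taken from this dump, and block numbers
  // then shift):
  //
  //   cf.cond_br %cond, ^bb2, ^bb12
  // ^bb2:  // pred: ^bb1
  //   cf.br ^bb3(%c0 : index)
  //
  // becomes
  //
  //   cf.cond_br %cond, ^bb3(%c0 : index), ^bb12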
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  cf.br ^bb1(%c0 : index)
^bb1(%31: index):  // 2 preds: ^bb0, ^bb9
  %32 = arith.cmpi slt, %31, %c2 : index
  cf.cond_br %32, ^bb2(%c0 : index), ^bb10
^bb2(%33: index):  // 2 preds: ^bb1, ^bb8
  %34 = arith.cmpi slt, %33, %c9 : index
  cf.cond_br %34, ^bb3, ^bb9
^bb3:  // pred: ^bb2
  %subview_2 = memref.subview %subview_1[%31, 0, 0, %33] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
  %subview_3 = memref.subview %subview_0[%31, 0, 0, %33] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
  cf.br ^bb4(%c0, %cst : index, vector<1x1x3xf32>)
^bb4(%35: index, %36: vector<1x1x3xf32>):  // 2 preds: ^bb3, ^bb7
  %37 = arith.cmpi slt, %35, %c5 : index
  cf.cond_br %37, ^bb5(%c0, %36 : index, vector<1x1x3xf32>), ^bb8
^bb5(%38: index, %39: vector<1x1x3xf32>):  // 2 preds: ^bb4, ^bb6
  %40 = arith.cmpi slt, %38, %c5 : index
  cf.cond_br %40, ^bb6, ^bb7
^bb6:  // pred: ^bb5
  %subview_4 = memref.subview %subview_2[0, 0, %35, %38] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
  %subview_5 = memref.subview %subview[0, 0, %35, %38] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
  %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
  %subview_8 = memref.subview %subview_6[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
  %41 = vector.load %subview_8[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %42 = vector.load %subview_8[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %43 = vector.load %subview_8[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %44 = vector.load %subview_8[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %45 = vector.load %subview_8[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %46 = vector.load %subview_8[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %47 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %48 = memref.load %subview_9[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %49 = vector.broadcast %48 : f32 to vector<3xf32>
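  // The six vector.fma ops below accumulate over the six input channels: each
  // step loads one filter scalar from %subview_9, broadcasts it, and
  // multiplies it against a vector of three contiguous input pixels (%41..%46),
  // i.e. the three output columns this iteration produces.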
  %50 = vector.fma %41, %49, %47 : vector<3xf32>
  %51 = memref.load %subview_9[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %52 = vector.broadcast %51 : f32 to vector<3xf32>
  %53 = vector.fma %42, %52, %50 : vector<3xf32>
  %54 = memref.load %subview_9[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %55 = vector.broadcast %54 : f32 to vector<3xf32>
  %56 = vector.fma %43, %55, %53 : vector<3xf32>
  %57 = memref.load %subview_9[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %58 = vector.broadcast %57 : f32 to vector<3xf32>
  %59 = vector.fma %44, %58, %56 : vector<3xf32>
  %60 = memref.load %subview_9[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %61 = vector.broadcast %60 : f32 to vector<3xf32>
  %62 = vector.fma %45, %61, %59 : vector<3xf32>
  %63 = memref.load %subview_9[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %64 = vector.broadcast %63 : f32 to vector<3xf32>
  %65 = vector.fma %46, %64, %62 : vector<3xf32>
  %66 = vector.broadcast %65 : vector<3xf32> to vector<1x1x3xf32>
  %67 = arith.addi %38, %c1 : index
  cf.br ^bb5(%67, %66 : index, vector<1x1x3xf32>)
^bb7:  // pred: ^bb5
  %68 = arith.addi %35, %c1 : index
  cf.br ^bb4(%68, %39 : index, vector<1x1x3xf32>)
^bb8:  // pred: ^bb4
  %69 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %subview_10 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
  vector.store %69, %subview_10[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
  %70 = arith.addi %33, %c3 : index
  cf.br ^bb2(%70 : index)
^bb9:  // pred: ^bb2
  %71 = arith.addi %31, %c1 : index
  cf.br ^bb1(%71 : index)
^bb10:  // pred: ^bb1
  return
}
// -----// IR Dump Before CSE (cse) //----- //
// (IR identical to the preceding dump)
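// CSE deletes an op whose result recomputes a value that already dominates it.
// Every SSA value in this dispatch is computed exactly once, so the pass finds
// nothing to eliminate and the dumps on either side of it match the
// Canonicalizer output above. A minimal sketch of what it would fold
// (hypothetical %x, %y; not taken from this dump):
//
//   %a = arith.addi %x, %y : index
//   %b = arith.addi %x, %y : index   // redundant recomputation
//
// after cse every use of %b refers to %a and the second addi is erased.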
// -----// IR Dump After CSE (cse) //----- //
// (IR identical to the preceding dump)
// -----// IR Dump Before ArithExpandOpsPass (arith-expand) //----- //
// (IR identical to the preceding dump)
// -----// IR Dump After ArithExpandOpsPass (arith-expand) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
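  // ArithExpandOpsPass rewrites arith ops that have no direct lowering further
  // down the pipeline (arith.ceildivsi, arith.floordivsi, and friends) into
  // sequences of primitive arith ops. This dispatch contains none of those, so
  // the function below is identical to its input. A minimal sketch of the kind
  // of rewrite the pass performs (hypothetical %a, %b; positive-operand case
  // only -- the in-tree pattern also handles negative values):
  //
  //   %q = arith.ceildivsi %a, %b : index
  //
  // expands to roughly
  //
  //   %one = arith.constant 1 : index
  //   %bm1 = arith.subi %b, %one : index
  //   %sum = arith.addi %a, %bm1 : index
  //   %q   = arith.divsi %sum, %b : index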
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  cf.br ^bb1(%c0 : index)
^bb1(%31: index):  // 2 preds: ^bb0, ^bb9
  %32 = arith.cmpi slt, %31, %c2 : index
  cf.cond_br %32, ^bb2(%c0 : index), ^bb10
^bb2(%33: index):  // 2 preds: ^bb1, ^bb8
  %34 = arith.cmpi slt, %33, %c9 : index
  cf.cond_br %34, ^bb3, ^bb9
^bb3:  // pred: ^bb2
  %subview_2 = memref.subview %subview_1[%31, 0, 0, %33] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
  %subview_3 = memref.subview %subview_0[%31, 0, 0, %33] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
  cf.br ^bb4(%c0, %cst : index, vector<1x1x3xf32>)
^bb4(%35: index, %36: vector<1x1x3xf32>):  // 2 preds: ^bb3, ^bb7
  %37 = arith.cmpi slt, %35, %c5 : index
  cf.cond_br %37, ^bb5(%c0, %36 : index, vector<1x1x3xf32>), ^bb8
^bb5(%38: index, %39: vector<1x1x3xf32>):  // 2 preds: ^bb4, ^bb6
  %40 = arith.cmpi slt, %38, %c5 : index
  cf.cond_br %40, ^bb6, ^bb7
^bb6:  // pred: ^bb5
  %subview_4 = memref.subview %subview_2[0, 0, %35, %38] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
  %subview_5 = memref.subview %subview[0, 0, %35, %38] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
  %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
  %subview_8 = memref.subview %subview_6[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
  %41 = vector.load %subview_8[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %42 = vector.load %subview_8[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %43 = vector.load %subview_8[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %44 = vector.load %subview_8[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %45 = vector.load %subview_8[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %46 = vector.load %subview_8[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %47 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %48 = memref.load %subview_9[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %49 = vector.broadcast %48 : f32 to vector<3xf32>
  %50 = vector.fma %41, %49, %47 : vector<3xf32>
  %51 = memref.load %subview_9[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %52 = vector.broadcast %51 : f32 to vector<3xf32>
  %53 = vector.fma %42, %52, %50 : vector<3xf32>
  %54 = memref.load %subview_9[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %55 = vector.broadcast %54 : f32 to vector<3xf32>
  %56 = vector.fma %43, %55, %53 : vector<3xf32>
  %57 = memref.load %subview_9[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %58 = vector.broadcast %57 : f32 to vector<3xf32>
  %59 = vector.fma %44, %58, %56 : vector<3xf32>
  %60 = memref.load %subview_9[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %61 = vector.broadcast %60 : f32 to vector<3xf32>
  %62 = vector.fma %45, %61, %59 : vector<3xf32>
  %63 = memref.load %subview_9[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %64 = vector.broadcast %63 : f32 to vector<3xf32>
  %65 = vector.fma %46, %64, %62 : vector<3xf32>
  %66 = vector.broadcast %65 : vector<3xf32> to vector<1x1x3xf32>
  %67 = arith.addi %38, %c1 : index
  cf.br ^bb5(%67, %66 : index, vector<1x1x3xf32>)
^bb7:  // pred: ^bb5
  %68 = arith.addi %35, %c1 : index
  cf.br ^bb4(%68, %39 : index, vector<1x1x3xf32>)
^bb8:  // pred: ^bb4
  %69 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %subview_10 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
  vector.store %69, %subview_10[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
  %70 = arith.addi %33, %c3 : index
  cf.br ^bb2(%70 : index)
^bb9:  // pred: ^bb2
  %71 = arith.addi %31, %c1 : index
  cf.br ^bb1(%71 : index)
^bb10:  // pred: ^bb1
  return
}
// -----// IR Dump Before ExpandOps (memref-expand) //----- //
// -----// IR Dump After ExpandOps (memref-expand) //----- //
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  cf.br ^bb1(%c0 : index)
^bb1(%31: index):  // 2 preds: ^bb0, ^bb9
  %32 = arith.cmpi slt, %31, %c2 : index
  cf.cond_br %32, ^bb2(%c0 : index), ^bb10
^bb2(%33: index):  // 2 preds: ^bb1, ^bb8
  %34 = arith.cmpi slt, %33, %c9 : index
  cf.cond_br %34, ^bb3, ^bb9
^bb3:  // pred: ^bb2
  cf.br ^bb4(%c0, %cst : index, vector<1x1x3xf32>)
^bb4(%35: index, %36: vector<1x1x3xf32>):  // 2 preds: ^bb3, ^bb7
  %37 = arith.cmpi slt, %35, %c5 : index
  cf.cond_br %37, ^bb5(%c0, %36 : index, vector<1x1x3xf32>), ^bb8
^bb5(%38: index, %39: vector<1x1x3xf32>):  // 2 preds: ^bb4, ^bb6
  %40 = arith.cmpi slt, %38, %c5 : index
  cf.cond_br %40, ^bb6, ^bb7
^bb6:  // pred: ^bb5
  %41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%workgroup_id_x, %35]
  %42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %43 = vector.load %30[%31, %c0, %41, %42] : memref<?x?x?x?xf32>, vector<3xf32>
  %44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%workgroup_id_x, %35]
  %45 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %46 = vector.load %30[%31, %c0, %44, %45] : memref<?x?x?x?xf32>, vector<3xf32>
  %47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%workgroup_id_x, %35]
  %48 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %49 = vector.load %30[%31, %c0, %47, %48] : memref<?x?x?x?xf32>, vector<3xf32>
  %50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%workgroup_id_x, %35]
  %51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %52 = vector.load %30[%31, %c0, %50, %51] : memref<?x?x?x?xf32>, vector<3xf32>
  %53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 4)>()[%workgroup_id_x, %35]
  %54 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %55 = vector.load %30[%31, %c0, %53, %54] : memref<?x?x?x?xf32>, vector<3xf32>
  %56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%workgroup_id_x, %35]
  %57 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %58 = vector.load %30[%31, %c0, %56, %57] : memref<?x?x?x?xf32>, vector<3xf32>
  %59 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %60 = memref.load %28[%workgroup_id_y, %c0, %35, %38] : memref<4x6x5x5xf32>
  %61 = vector.broadcast %60 : f32 to vector<3xf32>
  %62 = vector.fma %43, %61, %59 : vector<3xf32>
  %63 = memref.load %28[%workgroup_id_y, %c1, %35, %38] : memref<4x6x5x5xf32>
  %64 = vector.broadcast %63 : f32 to vector<3xf32>
  %65 = vector.fma %46, %64, %62 : vector<3xf32>
  %66 = memref.load %28[%workgroup_id_y, %c2, %35, %38] : memref<4x6x5x5xf32>
  %67 = vector.broadcast %66 : f32 to vector<3xf32>
  %68 = vector.fma %49, %67, %65 : vector<3xf32>
  %69 = memref.load %28[%workgroup_id_y, %c3, %35, %38] : memref<4x6x5x5xf32>
  %70 = vector.broadcast %69 : f32 to vector<3xf32>
  %71 = vector.fma %52, %70, %68 : vector<3xf32>
  %72 = memref.load %28[%workgroup_id_y, %c4, %35, %38] : memref<4x6x5x5xf32>
  %73 = vector.broadcast %72 : f32 to vector<3xf32>
  %74 = vector.fma %55, %73, %71 : vector<3xf32>
  %75 = memref.load %28[%workgroup_id_y, %c5, %35, %38] : memref<4x6x5x5xf32>
  %76 = vector.broadcast %75 : f32 to vector<3xf32>
  %77 = vector.fma %58, %76, %74 : vector<3xf32>
  %78 = vector.broadcast %77 : vector<3xf32> to vector<1x1x3xf32>
  %79 = arith.addi %38, %c1 : index
  cf.br ^bb5(%79, %78 : index, vector<1x1x3xf32>)
^bb7:  // pred: ^bb5
  %80 = arith.addi %35, %c1 : index
  cf.br ^bb4(%80, %39 : index, vector<1x1x3xf32>)
^bb8:  // pred: ^bb4
  %81 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  vector.store %81, %29[%31, %workgroup_id_y, %workgroup_id_x, %33] : memref<2x4x7x9xf32>, vector<3xf32>
  %82 = arith.addi %33, %c3 : index
  cf.br ^bb2(%82 : index)
^bb9:  // pred: ^bb2
  %83 = arith.addi %31, %c1 : index
  cf.br ^bb1(%83 : index)
^bb10:  // pred: ^bb1
  return
}
// -----// IR Dump Before EmulateNarrowTypePass (iree-codegen-emulate-narrow-type) //----- //
// -----// IR Dump After EmulateNarrowTypePass (iree-codegen-emulate-narrow-type) //----- //
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
arith.cmpi slt, %38, %c5 : index cf.cond_br %40, ^bb6, ^bb7 ^bb6: // pred: ^bb5 %41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%workgroup_id_x, %35] %42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %43 = vector.load %30[%31, %c0, %41, %42] : memref, vector<3xf32> %44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%workgroup_id_x, %35] %45 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %46 = vector.load %30[%31, %c0, %44, %45] : memref, vector<3xf32> %47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%workgroup_id_x, %35] %48 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %49 = vector.load %30[%31, %c0, %47, %48] : memref, vector<3xf32> %50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%workgroup_id_x, %35] %51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %52 = vector.load %30[%31, %c0, %50, %51] : memref, vector<3xf32> %53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 4)>()[%workgroup_id_x, %35] %54 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %55 = vector.load %30[%31, %c0, %53, %54] : memref, vector<3xf32> %56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%workgroup_id_x, %35] %57 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %58 = vector.load %30[%31, %c0, %56, %57] : memref, vector<3xf32> %59 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32> %60 = memref.load %28[%workgroup_id_y, %c0, %35, %38] : memref<4x6x5x5xf32> %61 = vector.broadcast %60 : f32 to vector<3xf32> %62 = vector.fma %43, %61, %59 : vector<3xf32> %63 = memref.load %28[%workgroup_id_y, %c1, %35, %38] : memref<4x6x5x5xf32> %64 = vector.broadcast %63 : f32 to vector<3xf32> %65 = vector.fma %46, %64, %62 : vector<3xf32> %66 = memref.load %28[%workgroup_id_y, %c2, %35, %38] : memref<4x6x5x5xf32> %67 = vector.broadcast %66 : f32 to vector<3xf32> %68 = vector.fma %49, %67, %65 : vector<3xf32> %69 = memref.load %28[%workgroup_id_y, %c3, %35, %38] : memref<4x6x5x5xf32> %70 = vector.broadcast %69 : f32 to vector<3xf32> %71 = vector.fma %52, %70, %68 : vector<3xf32> %72 = memref.load %28[%workgroup_id_y, %c4, %35, %38] : memref<4x6x5x5xf32> %73 = vector.broadcast %72 : f32 to vector<3xf32> %74 = vector.fma %55, %73, %71 : vector<3xf32> %75 = memref.load %28[%workgroup_id_y, %c5, %35, %38] : memref<4x6x5x5xf32> %76 = vector.broadcast %75 : f32 to vector<3xf32> %77 = vector.fma %58, %76, %74 : vector<3xf32> %78 = vector.broadcast %77 : vector<3xf32> to vector<1x1x3xf32> %79 = arith.addi %38, %c1 : index cf.br ^bb5(%79, %78 : index, vector<1x1x3xf32>) ^bb7: // pred: ^bb5 %80 = arith.addi %35, %c1 : index cf.br ^bb4(%80, %39 : index, vector<1x1x3xf32>) ^bb8: // pred: ^bb4 %81 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32> vector.store %81, %29[%31, %workgroup_id_y, %workgroup_id_x, %33] : memref<2x4x7x9xf32>, vector<3xf32> %82 = arith.addi %33, %c3 : index cf.br ^bb2(%82 : index) ^bb9: // pred: ^bb2 %83 = arith.addi %31, %c1 : index cf.br ^bb1(%83 : index) ^bb10: // pred: ^bb1 return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : 
index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index cf.br ^bb1(%c0 : index) ^bb1(%31: index): // 2 preds: ^bb0, ^bb8 %32 = arith.cmpi slt, %31, %c2 : index cf.cond_br %32, ^bb2(%c0 : index), ^bb9 ^bb2(%33: index): // 2 preds: ^bb1, ^bb7 %34 = arith.cmpi slt, %33, %c9 : index cf.cond_br %34, ^bb3(%c0, %cst : index, vector<1x1x3xf32>), ^bb8 ^bb3(%35: index, %36: vector<1x1x3xf32>): // 2 preds: ^bb2, ^bb6 %37 = arith.cmpi slt, %35, %c5 : index cf.cond_br %37, ^bb4(%c0, %36 : index, vector<1x1x3xf32>), ^bb7 ^bb4(%38: index, %39: vector<1x1x3xf32>): // 2 preds: ^bb3, ^bb5 %40 = arith.cmpi slt, %38, %c5 : index cf.cond_br %40, ^bb5, ^bb6 ^bb5: // pred: ^bb4 
%41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%workgroup_id_x, %35] %42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %43 = vector.load %30[%31, %c0, %41, %42] : memref<?x?x?x?xf32>, vector<3xf32> %44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%workgroup_id_x, %35] %45 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %46 = vector.load %30[%31, %c0, %44, %45] : memref<?x?x?x?xf32>, vector<3xf32> %47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%workgroup_id_x, %35] %48 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %49 = vector.load %30[%31, %c0, %47, %48] : memref<?x?x?x?xf32>, vector<3xf32> %50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%workgroup_id_x, %35] %51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %52 = vector.load %30[%31, %c0, %50, %51] : memref<?x?x?x?xf32>, vector<3xf32> %53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 4)>()[%workgroup_id_x, %35] %54 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %55 = vector.load %30[%31, %c0, %53, %54] : memref<?x?x?x?xf32>, vector<3xf32> %56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%workgroup_id_x, %35] %57 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %58 = vector.load %30[%31, %c0, %56, %57] : memref<?x?x?x?xf32>, vector<3xf32> %59 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32> %60 = memref.load %28[%workgroup_id_y, %c0, %35, %38] : memref<4x6x5x5xf32> %61 = vector.broadcast %60 : f32 to vector<3xf32> %62 = vector.fma %43, %61, %59 : vector<3xf32> %63 = memref.load %28[%workgroup_id_y, %c1, %35, %38] : memref<4x6x5x5xf32> %64 = vector.broadcast %63 : f32 to vector<3xf32> %65 = vector.fma %46, %64, %62 : vector<3xf32> %66 = memref.load %28[%workgroup_id_y, %c2, %35, %38] : memref<4x6x5x5xf32> %67 = vector.broadcast %66 : f32 to vector<3xf32> %68 = vector.fma %49, %67, %65 : vector<3xf32> %69 = memref.load %28[%workgroup_id_y, %c3, %35, %38] : memref<4x6x5x5xf32> %70 = vector.broadcast %69 : f32 to vector<3xf32> %71 = vector.fma %52, %70, %68 : vector<3xf32> %72 = memref.load %28[%workgroup_id_y, %c4, %35, %38] : memref<4x6x5x5xf32> %73 = vector.broadcast %72 : f32 to vector<3xf32> %74 = vector.fma %55, %73, %71 : vector<3xf32> %75 = memref.load %28[%workgroup_id_y, %c5, %35, %38] : memref<4x6x5x5xf32> %76 = vector.broadcast %75 : f32 to vector<3xf32> %77 = vector.fma %58, %76, %74 : vector<3xf32> %78 = vector.broadcast %77 : vector<3xf32> to vector<1x1x3xf32> %79 = arith.addi %38, %c1 : index cf.br ^bb4(%79, %78 : index, vector<1x1x3xf32>) ^bb6: // pred: ^bb4 %80 = arith.addi %35, %c1 : index cf.br ^bb3(%80, %39 : index, vector<1x1x3xf32>) ^bb7: // pred: ^bb3 %81 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32> vector.store %81, %29[%31, %workgroup_id_y, %workgroup_id_x, %33] : memref<2x4x7x9xf32>, vector<3xf32> %82 = arith.addi %33, %c3 : index cf.br ^bb2(%82 : index) ^bb8: // pred: ^bb2 %83 = arith.addi %31, %c1 : index cf.br ^bb1(%83 : index) ^bb9: // pred: ^bb1 return }
// -----// IR Dump After CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>)
ordinal(0) : i32 %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref<?x?x?x?xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index cf.br ^bb1(%c0 : index) ^bb1(%31: index): // 2 preds: ^bb0, ^bb8 %32 = arith.cmpi slt, %31, %c2 : index cf.cond_br %32, ^bb2(%c0 : index), ^bb9 ^bb2(%33: index): // 2 preds: ^bb1, ^bb7 %34 = arith.cmpi slt, %33, %c9 : index cf.cond_br %34, ^bb3(%c0, %cst : index, vector<1x1x3xf32>), ^bb8 ^bb3(%35: index, %36: vector<1x1x3xf32>): // 2 preds: ^bb2, ^bb6 %37 = arith.cmpi slt, %35, %c5 : index cf.cond_br %37, ^bb4(%c0, %36 : index, vector<1x1x3xf32>), ^bb7 ^bb4(%38: index, %39: vector<1x1x3xf32>): // 2 preds: ^bb3, ^bb5 %40 = arith.cmpi slt, %38, %c5 : index cf.cond_br %40, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%workgroup_id_x, %35] %42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %43 = vector.load %30[%31, %c0,
%41, %42] : memref<?x?x?x?xf32>, vector<3xf32> %44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%workgroup_id_x, %35] %45 = vector.load %30[%31, %c0, %44, %42] : memref<?x?x?x?xf32>, vector<3xf32> %46 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%workgroup_id_x, %35] %47 = vector.load %30[%31, %c0, %46, %42] : memref<?x?x?x?xf32>, vector<3xf32> %48 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%workgroup_id_x, %35] %49 = vector.load %30[%31, %c0, %48, %42] : memref<?x?x?x?xf32>, vector<3xf32> %50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 4)>()[%workgroup_id_x, %35] %51 = vector.load %30[%31, %c0, %50, %42] : memref<?x?x?x?xf32>, vector<3xf32> %52 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%workgroup_id_x, %35] %53 = vector.load %30[%31, %c0, %52, %42] : memref<?x?x?x?xf32>, vector<3xf32> %54 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32> %55 = memref.load %28[%workgroup_id_y, %c0, %35, %38] : memref<4x6x5x5xf32> %56 = vector.broadcast %55 : f32 to vector<3xf32> %57 = vector.fma %43, %56, %54 : vector<3xf32> %58 = memref.load %28[%workgroup_id_y, %c1, %35, %38] : memref<4x6x5x5xf32> %59 = vector.broadcast %58 : f32 to vector<3xf32> %60 = vector.fma %45, %59, %57 : vector<3xf32> %61 = memref.load %28[%workgroup_id_y, %c2, %35, %38] : memref<4x6x5x5xf32> %62 = vector.broadcast %61 : f32 to vector<3xf32> %63 = vector.fma %47, %62, %60 : vector<3xf32> %64 = memref.load %28[%workgroup_id_y, %c3, %35, %38] : memref<4x6x5x5xf32> %65 = vector.broadcast %64 : f32 to vector<3xf32> %66 = vector.fma %49, %65, %63 : vector<3xf32> %67 = memref.load %28[%workgroup_id_y, %c4, %35, %38] : memref<4x6x5x5xf32> %68 = vector.broadcast %67 : f32 to vector<3xf32> %69 = vector.fma %51, %68, %66 : vector<3xf32> %70 = memref.load %28[%workgroup_id_y, %c5, %35, %38] : memref<4x6x5x5xf32> %71 = vector.broadcast %70 : f32 to vector<3xf32> %72 = vector.fma %53, %71, %69 : vector<3xf32> %73 = vector.broadcast %72 : vector<3xf32> to vector<1x1x3xf32> %74 = arith.addi %38, %c1 : index cf.br ^bb4(%74, %73 : index, vector<1x1x3xf32>) ^bb6: // pred: ^bb4 %75 = arith.addi %35, %c1 : index cf.br ^bb3(%75, %39 : index, vector<1x1x3xf32>) ^bb7: // pred: ^bb3 %76 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32> vector.store %76, %29[%31, %workgroup_id_y, %workgroup_id_x, %33] : memref<2x4x7x9xf32>, vector<3xf32> %77 = arith.addi %33, %c3 : index cf.br ^bb2(%77 : index) ^bb8: // pred: ^bb2 %78 = arith.addi %31, %c1 : index cf.br ^bb1(%78 : index) ^bb9: // pred: ^bb1 return }
// -----// IR Dump After ConvertToLLVMPass (iree-convert-to-llvm) //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr ->
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %22 = llvm.extractvalue %21[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %23 = llvm.getelementptr %22[3] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %26 = llvm.extractvalue %25[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %27 = llvm.getelementptr %26[4] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %30 = llvm.extractvalue %29[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %31 = llvm.getelementptr %30[5] : (!llvm.ptr) -> !llvm.ptr, i32 %32 = llvm.load %31 : !llvm.ptr -> i32 %33 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %34 = llvm.extractvalue %33[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %35 = llvm.getelementptr %34[6] : (!llvm.ptr) -> !llvm.ptr, i32 %36 = llvm.load %35 : !llvm.ptr -> i32 %37 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %38 = llvm.extractvalue %37[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %39 = llvm.getelementptr %38[7] : (!llvm.ptr) -> !llvm.ptr, i32 %40 = llvm.load %39 : !llvm.ptr -> i32 %41 = llvm.zext %20 : i32 to i64 %42 = llvm.zext %24 : i32 to i64 %43 = llvm.shl %42, %16 : i64 %44 = llvm.or %41, %43 : i64 %45 = llvm.zext %28 : i32 to i64 %46 = llvm.zext %32 : i32 to i64 %47 = llvm.shl %46, %16 : i64 %48 = llvm.or %45, %47 : i64 %49 = llvm.zext %36 : i32 to i64 %50 = llvm.zext %40 : i32 to i64 %51 = llvm.shl %50, %16 : i64 %52 = llvm.or %49, %51 : i64 %53 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %54 = llvm.extractvalue %53[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %55 = llvm.getelementptr %54[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %56 = llvm.load %55 : !llvm.ptr -> !llvm.ptr %57 = llvm.ptrtoint %56 : !llvm.ptr to i64 %58 = llvm.and %57, %4 : i64 %59 = llvm.icmp "eq" %58, %15 : i64 "llvm.intr.assume"(%59) : (i1) -> () %60 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %61 = llvm.extractvalue %60[10] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %62 = llvm.getelementptr %61[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %63 = llvm.load %62 : !llvm.ptr -> !llvm.ptr %64 = llvm.ptrtoint %63 : !llvm.ptr to i64 %65 = llvm.and %64, %4 : i64 %66 = llvm.icmp "eq" %65, %15 : i64 "llvm.intr.assume"(%66) : (i1) -> () %67 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %68 = llvm.extractvalue %67[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %69 = llvm.load %68 : !llvm.ptr -> !llvm.ptr %70 = llvm.mul %52, %2 : i64 %71 = llvm.mul %70, %48 : i64 %72 = llvm.mul %71, %44 : i64 %73 = llvm.ptrtoint %69 : !llvm.ptr to i64 %74 = llvm.and %73, %4 : i64 %75 = llvm.icmp "eq" %74, %15 : i64 "llvm.intr.assume"(%75) : (i1) -> () %76 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %77 = llvm.extractvalue %76[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %78 = llvm.zext %77 : i32 to i64 %79 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %80 = llvm.extractvalue %79[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %81 = llvm.zext %80 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%82: i64): // 2 preds: ^bb0, ^bb8 %83 = llvm.icmp "slt" %82, %14 : i64 llvm.cond_br %83, ^bb2(%15 : i64), ^bb9 ^bb2(%84: i64): // 2 preds: ^bb1, ^bb7 %85 = llvm.icmp "slt" %84, %12 : i64 llvm.cond_br %85, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%86: i64, %87: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %88 = llvm.icmp "slt" %86, %10 : i64 llvm.cond_br %88, ^bb4(%15, %87 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%89: i64, %90: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %91 = llvm.icmp "slt" %89, %10 : i64 llvm.cond_br %91, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %92 = llvm.add %78, %86 : i64 %93 = llvm.add %84, %89 : i64 %94 = llvm.mul %82, %72 : i64 %95 = llvm.mul %71, %15 : i64 %96 = llvm.add %94, %95 : i64 %97 = llvm.mul %92, %70 : i64 %98 = llvm.add %96, %97 : i64 %99 = llvm.add %98, %93 : i64 %100 = llvm.getelementptr %69[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %78, %86 : i64 %103 = llvm.add %102, %13 : i64 %104 = llvm.mul %82, %72 : i64 %105 = llvm.mul %71, %15 : i64 %106 = llvm.add %104, %105 : i64 %107 = llvm.mul %103, %70 : i64 %108 = llvm.add %106, %107 : i64 %109 = llvm.add %108, %93 : i64 %110 = llvm.getelementptr %69[%109] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %111 = llvm.load %110 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %112 = llvm.add %78, %86 : i64 %113 = llvm.add %112, %14 : i64 %114 = llvm.mul %82, %72 : i64 %115 = llvm.mul %71, %15 : i64 %116 = llvm.add %114, %115 : i64 %117 = llvm.mul %113, %70 : i64 %118 = llvm.add %116, %117 : i64 %119 = llvm.add %118, %93 : i64 %120 = llvm.getelementptr %69[%119] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %121 = llvm.load %120 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %122 = llvm.add %78, %86 : i64 %123 = llvm.add %122, %11 : i64 %124 = llvm.mul %82, %72 : i64 %125 = llvm.mul %71, %15 : i64 %126 = 
llvm.add %124, %125 : i64 %127 = llvm.mul %123, %70 : i64 %128 = llvm.add %126, %127 : i64 %129 = llvm.add %128, %93 : i64 %130 = llvm.getelementptr %69[%129] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %131 = llvm.load %130 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %132 = llvm.add %78, %86 : i64 %133 = llvm.add %132, %8 : i64 %134 = llvm.mul %82, %72 : i64 %135 = llvm.mul %71, %15 : i64 %136 = llvm.add %134, %135 : i64 %137 = llvm.mul %133, %70 : i64 %138 = llvm.add %136, %137 : i64 %139 = llvm.add %138, %93 : i64 %140 = llvm.getelementptr %69[%139] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %141 = llvm.load %140 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %142 = llvm.add %78, %86 : i64 %143 = llvm.add %142, %10 : i64 %144 = llvm.mul %82, %72 : i64 %145 = llvm.mul %71, %15 : i64 %146 = llvm.add %144, %145 : i64 %147 = llvm.mul %143, %70 : i64 %148 = llvm.add %146, %147 : i64 %149 = llvm.add %148, %93 : i64 %150 = llvm.getelementptr %69[%149] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %151 = llvm.load %150 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %152 = llvm.extractvalue %90[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %153 = llvm.mul %81, %6 : i64 %154 = llvm.mul %15, %5 : i64 %155 = llvm.add %153, %154 : i64 %156 = llvm.mul %86, %10 : i64 %157 = llvm.add %155, %156 : i64 %158 = llvm.add %157, %89 : i64 %159 = llvm.getelementptr %56[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %160 = llvm.load %159 : !llvm.ptr -> f32 %161 = llvm.insertelement %160, %1[%0 : i32] : vector<3xf32> %162 = llvm.shufflevector %161, %1 [0, 0, 0] : vector<3xf32> %163 = llvm.intr.fmuladd(%101, %162, %152) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %164 = llvm.mul %81, %6 : i64 %165 = llvm.mul %13, %5 : i64 %166 = llvm.add %164, %165 : i64 %167 = llvm.mul %86, %10 : i64 %168 = llvm.add %166, %167 : i64 %169 = llvm.add %168, %89 : i64 %170 = llvm.getelementptr %56[%169] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %171 = llvm.load %170 : !llvm.ptr -> f32 %172 = llvm.insertelement %171, %1[%0 : i32] : vector<3xf32> %173 = llvm.shufflevector %172, %1 [0, 0, 0] : vector<3xf32> %174 = llvm.intr.fmuladd(%111, %173, %163) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %175 = llvm.mul %81, %6 : i64 %176 = llvm.mul %14, %5 : i64 %177 = llvm.add %175, %176 : i64 %178 = llvm.mul %86, %10 : i64 %179 = llvm.add %177, %178 : i64 %180 = llvm.add %179, %89 : i64 %181 = llvm.getelementptr %56[%180] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %182 = llvm.load %181 : !llvm.ptr -> f32 %183 = llvm.insertelement %182, %1[%0 : i32] : vector<3xf32> %184 = llvm.shufflevector %183, %1 [0, 0, 0] : vector<3xf32> %185 = llvm.intr.fmuladd(%121, %184, %174) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %186 = llvm.mul %81, %6 : i64 %187 = llvm.mul %11, %5 : i64 %188 = llvm.add %186, %187 : i64 %189 = llvm.mul %86, %10 : i64 %190 = llvm.add %188, %189 : i64 %191 = llvm.add %190, %89 : i64 %192 = llvm.getelementptr %56[%191] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %193 = llvm.load %192 : !llvm.ptr -> f32 %194 = llvm.insertelement %193, %1[%0 : i32] : vector<3xf32> %195 = llvm.shufflevector %194, %1 [0, 0, 0] : vector<3xf32> %196 = llvm.intr.fmuladd(%131, %195, %185) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %197 = llvm.mul %81, %6 : i64 %198 = llvm.mul %8, %5 : i64 %199 = llvm.add %197, %198 : i64 %200 = llvm.mul %86, %10 : i64 %201 = llvm.add %199, %200 : i64 %202 = llvm.add %201, %89 : i64 %203 = llvm.getelementptr %56[%202] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %204 = 
llvm.load %203 : !llvm.ptr -> f32 %205 = llvm.insertelement %204, %1[%0 : i32] : vector<3xf32> %206 = llvm.shufflevector %205, %1 [0, 0, 0] : vector<3xf32> %207 = llvm.intr.fmuladd(%141, %206, %196) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %208 = llvm.mul %81, %6 : i64 %209 = llvm.mul %10, %5 : i64 %210 = llvm.add %208, %209 : i64 %211 = llvm.mul %86, %10 : i64 %212 = llvm.add %210, %211 : i64 %213 = llvm.add %212, %89 : i64 %214 = llvm.getelementptr %56[%213] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %215 = llvm.load %214 : !llvm.ptr -> f32 %216 = llvm.insertelement %215, %1[%0 : i32] : vector<3xf32> %217 = llvm.shufflevector %216, %1 [0, 0, 0] : vector<3xf32> %218 = llvm.intr.fmuladd(%151, %217, %207) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %219 = llvm.insertvalue %218, %7[0] : !llvm.array<1 x vector<3xf32>> %220 = llvm.insertvalue %219, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %221 = llvm.add %89, %13 : i64 llvm.br ^bb4(%221, %220 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %222 = llvm.add %86, %13 : i64 llvm.br ^bb3(%222, %90 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %223 = llvm.extractvalue %87[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %224 = llvm.mul %82, %3 : i64 %225 = llvm.mul %81, %4 : i64 %226 = llvm.add %224, %225 : i64 %227 = llvm.mul %78, %12 : i64 %228 = llvm.add %226, %227 : i64 %229 = llvm.add %228, %84 : i64 %230 = llvm.getelementptr %63[%229] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %223, %230 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %231 = llvm.add %84, %11 : i64 llvm.br ^bb2(%231 : i64) ^bb8: // pred: ^bb2 %232 = llvm.add %82, %13 : i64 llvm.br ^bb1(%232 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } }
// -----// IR Dump After ReconcileUnrealizedCasts (reconcile-unrealized-casts) //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %22 = llvm.extractvalue %21[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %23 = llvm.getelementptr %22[3] : (!llvm.ptr) ->
!llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %26 = llvm.extractvalue %25[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %27 = llvm.getelementptr %26[4] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %30 = llvm.extractvalue %29[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %31 = llvm.getelementptr %30[5] : (!llvm.ptr) -> !llvm.ptr, i32 %32 = llvm.load %31 : !llvm.ptr -> i32 %33 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %34 = llvm.extractvalue %33[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %35 = llvm.getelementptr %34[6] : (!llvm.ptr) -> !llvm.ptr, i32 %36 = llvm.load %35 : !llvm.ptr -> i32 %37 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %38 = llvm.extractvalue %37[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %39 = llvm.getelementptr %38[7] : (!llvm.ptr) -> !llvm.ptr, i32 %40 = llvm.load %39 : !llvm.ptr -> i32 %41 = llvm.zext %20 : i32 to i64 %42 = llvm.zext %24 : i32 to i64 %43 = llvm.shl %42, %16 : i64 %44 = llvm.or %41, %43 : i64 %45 = llvm.zext %28 : i32 to i64 %46 = llvm.zext %32 : i32 to i64 %47 = llvm.shl %46, %16 : i64 %48 = llvm.or %45, %47 : i64 %49 = llvm.zext %36 : i32 to i64 %50 = llvm.zext %40 : i32 to i64 %51 = llvm.shl %50, %16 : i64 %52 = llvm.or %49, %51 : i64 %53 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %54 = llvm.extractvalue %53[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %55 = llvm.getelementptr %54[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %56 = llvm.load %55 : !llvm.ptr -> !llvm.ptr %57 = llvm.ptrtoint %56 : !llvm.ptr to i64 %58 = llvm.and %57, %4 : i64 %59 = llvm.icmp "eq" %58, %15 : i64 "llvm.intr.assume"(%59) : (i1) -> () %60 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %61 = llvm.extractvalue %60[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %62 = llvm.getelementptr %61[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %63 = llvm.load %62 : !llvm.ptr -> !llvm.ptr %64 = llvm.ptrtoint %63 : !llvm.ptr to i64 %65 = llvm.and %64, %4 : i64 %66 = llvm.icmp "eq" %65, %15 : i64 "llvm.intr.assume"(%66) : (i1) -> () %67 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %68 = llvm.extractvalue %67[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %69 = llvm.load %68 : !llvm.ptr -> !llvm.ptr %70 = llvm.mul %52, %2 : i64 
%71 = llvm.mul %70, %48 : i64 %72 = llvm.mul %71, %44 : i64 %73 = llvm.ptrtoint %69 : !llvm.ptr to i64 %74 = llvm.and %73, %4 : i64 %75 = llvm.icmp "eq" %74, %15 : i64 "llvm.intr.assume"(%75) : (i1) -> () %76 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %77 = llvm.extractvalue %76[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %78 = llvm.zext %77 : i32 to i64 %79 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %80 = llvm.extractvalue %79[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %81 = llvm.zext %80 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%82: i64): // 2 preds: ^bb0, ^bb8 %83 = llvm.icmp "slt" %82, %14 : i64 llvm.cond_br %83, ^bb2(%15 : i64), ^bb9 ^bb2(%84: i64): // 2 preds: ^bb1, ^bb7 %85 = llvm.icmp "slt" %84, %12 : i64 llvm.cond_br %85, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%86: i64, %87: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %88 = llvm.icmp "slt" %86, %10 : i64 llvm.cond_br %88, ^bb4(%15, %87 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%89: i64, %90: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %91 = llvm.icmp "slt" %89, %10 : i64 llvm.cond_br %91, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %92 = llvm.add %78, %86 : i64 %93 = llvm.add %84, %89 : i64 %94 = llvm.mul %82, %72 : i64 %95 = llvm.mul %71, %15 : i64 %96 = llvm.add %94, %95 : i64 %97 = llvm.mul %92, %70 : i64 %98 = llvm.add %96, %97 : i64 %99 = llvm.add %98, %93 : i64 %100 = llvm.getelementptr %69[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %78, %86 : i64 %103 = llvm.add %102, %13 : i64 %104 = llvm.mul %82, %72 : i64 %105 = llvm.mul %71, %15 : i64 %106 = llvm.add %104, %105 : i64 %107 = llvm.mul %103, %70 : i64 %108 = llvm.add %106, %107 : i64 %109 = llvm.add %108, %93 : i64 %110 = llvm.getelementptr %69[%109] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %111 = llvm.load %110 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %112 = llvm.add %78, %86 : i64 %113 = llvm.add %112, %14 : i64 %114 = llvm.mul %82, %72 : i64 %115 = llvm.mul %71, %15 : i64 %116 = llvm.add %114, %115 : i64 %117 = llvm.mul %113, %70 : i64 %118 = llvm.add %116, %117 : i64 %119 = llvm.add %118, %93 : i64 %120 = llvm.getelementptr %69[%119] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %121 = llvm.load %120 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %122 = llvm.add %78, %86 : i64 %123 = llvm.add %122, %11 : i64 %124 = llvm.mul %82, %72 : i64 %125 = llvm.mul %71, %15 : i64 %126 = llvm.add %124, %125 : i64 %127 = llvm.mul %123, %70 : i64 %128 = llvm.add %126, %127 : i64 %129 = llvm.add %128, %93 : i64 %130 = llvm.getelementptr %69[%129] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %131 = llvm.load %130 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %132 = llvm.add %78, %86 : i64 %133 = llvm.add %132, %8 : i64 %134 = llvm.mul %82, %72 : i64 %135 = llvm.mul %71, %15 : i64 %136 = llvm.add %134, %135 : i64 %137 = llvm.mul %133, %70 : i64 %138 = llvm.add %136, %137 : i64 %139 = llvm.add %138, %93 : i64 %140 = llvm.getelementptr %69[%139] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %141 = llvm.load %140 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %142 = llvm.add %78, %86 : i64 %143 = llvm.add %142, %10 : i64 %144 = llvm.mul %82, %72 : 
i64 %145 = llvm.mul %71, %15 : i64 %146 = llvm.add %144, %145 : i64 %147 = llvm.mul %143, %70 : i64 %148 = llvm.add %146, %147 : i64 %149 = llvm.add %148, %93 : i64 %150 = llvm.getelementptr %69[%149] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %151 = llvm.load %150 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %152 = llvm.extractvalue %90[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %153 = llvm.mul %81, %6 : i64 %154 = llvm.mul %15, %5 : i64 %155 = llvm.add %153, %154 : i64 %156 = llvm.mul %86, %10 : i64 %157 = llvm.add %155, %156 : i64 %158 = llvm.add %157, %89 : i64 %159 = llvm.getelementptr %56[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %160 = llvm.load %159 : !llvm.ptr -> f32 %161 = llvm.insertelement %160, %1[%0 : i32] : vector<3xf32> %162 = llvm.shufflevector %161, %1 [0, 0, 0] : vector<3xf32> %163 = llvm.intr.fmuladd(%101, %162, %152) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %164 = llvm.mul %81, %6 : i64 %165 = llvm.mul %13, %5 : i64 %166 = llvm.add %164, %165 : i64 %167 = llvm.mul %86, %10 : i64 %168 = llvm.add %166, %167 : i64 %169 = llvm.add %168, %89 : i64 %170 = llvm.getelementptr %56[%169] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %171 = llvm.load %170 : !llvm.ptr -> f32 %172 = llvm.insertelement %171, %1[%0 : i32] : vector<3xf32> %173 = llvm.shufflevector %172, %1 [0, 0, 0] : vector<3xf32> %174 = llvm.intr.fmuladd(%111, %173, %163) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %175 = llvm.mul %81, %6 : i64 %176 = llvm.mul %14, %5 : i64 %177 = llvm.add %175, %176 : i64 %178 = llvm.mul %86, %10 : i64 %179 = llvm.add %177, %178 : i64 %180 = llvm.add %179, %89 : i64 %181 = llvm.getelementptr %56[%180] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %182 = llvm.load %181 : !llvm.ptr -> f32 %183 = llvm.insertelement %182, %1[%0 : i32] : vector<3xf32> %184 = llvm.shufflevector %183, %1 [0, 0, 0] : vector<3xf32> %185 = llvm.intr.fmuladd(%121, %184, %174) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %186 = llvm.mul %81, %6 : i64 %187 = llvm.mul %11, %5 : i64 %188 = llvm.add %186, %187 : i64 %189 = llvm.mul %86, %10 : i64 %190 = llvm.add %188, %189 : i64 %191 = llvm.add %190, %89 : i64 %192 = llvm.getelementptr %56[%191] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %193 = llvm.load %192 : !llvm.ptr -> f32 %194 = llvm.insertelement %193, %1[%0 : i32] : vector<3xf32> %195 = llvm.shufflevector %194, %1 [0, 0, 0] : vector<3xf32> %196 = llvm.intr.fmuladd(%131, %195, %185) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %197 = llvm.mul %81, %6 : i64 %198 = llvm.mul %8, %5 : i64 %199 = llvm.add %197, %198 : i64 %200 = llvm.mul %86, %10 : i64 %201 = llvm.add %199, %200 : i64 %202 = llvm.add %201, %89 : i64 %203 = llvm.getelementptr %56[%202] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %204 = llvm.load %203 : !llvm.ptr -> f32 %205 = llvm.insertelement %204, %1[%0 : i32] : vector<3xf32> %206 = llvm.shufflevector %205, %1 [0, 0, 0] : vector<3xf32> %207 = llvm.intr.fmuladd(%141, %206, %196) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %208 = llvm.mul %81, %6 : i64 %209 = llvm.mul %10, %5 : i64 %210 = llvm.add %208, %209 : i64 %211 = llvm.mul %86, %10 : i64 %212 = llvm.add %210, %211 : i64 %213 = llvm.add %212, %89 : i64 %214 = llvm.getelementptr %56[%213] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %215 = llvm.load %214 : !llvm.ptr -> f32 %216 = llvm.insertelement %215, %1[%0 : i32] : vector<3xf32> %217 = llvm.shufflevector %216, %1 [0, 0, 0] : vector<3xf32> %218 = llvm.intr.fmuladd(%151, %217, %207) : (vector<3xf32>, vector<3xf32>, 
vector<3xf32>) -> vector<3xf32> %219 = llvm.insertvalue %218, %7[0] : !llvm.array<1 x vector<3xf32>> %220 = llvm.insertvalue %219, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %221 = llvm.add %89, %13 : i64 llvm.br ^bb4(%221, %220 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %222 = llvm.add %86, %13 : i64 llvm.br ^bb3(%222, %90 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %223 = llvm.extractvalue %87[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %224 = llvm.mul %82, %3 : i64 %225 = llvm.mul %81, %4 : i64 %226 = llvm.add %224, %225 : i64 %227 = llvm.mul %78, %12 : i64 %228 = llvm.add %226, %227 : i64 %229 = llvm.add %228, %84 : i64 %230 = llvm.getelementptr %63[%229] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %223, %230 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %231 = llvm.add %84, %11 : i64 llvm.br ^bb2(%231 : i64) ^bb8: // pred: ^bb2 %232 = llvm.add %82, %13 : i64 llvm.br ^bb1(%232 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } // -----// IR Dump Before LLVMCPUSynchronizeSymbolVisibilityPass (iree-llvmcpu-synchronize-symbol-visibility) //----- // module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %22 = llvm.extractvalue %21[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %23 = llvm.getelementptr %22[3] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %26 = llvm.extractvalue %25[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, 
ptr)> %27 = llvm.getelementptr %26[4] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %30 = llvm.extractvalue %29[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %31 = llvm.getelementptr %30[5] : (!llvm.ptr) -> !llvm.ptr, i32 %32 = llvm.load %31 : !llvm.ptr -> i32 %33 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %34 = llvm.extractvalue %33[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %35 = llvm.getelementptr %34[6] : (!llvm.ptr) -> !llvm.ptr, i32 %36 = llvm.load %35 : !llvm.ptr -> i32 %37 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %38 = llvm.extractvalue %37[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %39 = llvm.getelementptr %38[7] : (!llvm.ptr) -> !llvm.ptr, i32 %40 = llvm.load %39 : !llvm.ptr -> i32 %41 = llvm.zext %20 : i32 to i64 %42 = llvm.zext %24 : i32 to i64 %43 = llvm.shl %42, %16 : i64 %44 = llvm.or %41, %43 : i64 %45 = llvm.zext %28 : i32 to i64 %46 = llvm.zext %32 : i32 to i64 %47 = llvm.shl %46, %16 : i64 %48 = llvm.or %45, %47 : i64 %49 = llvm.zext %36 : i32 to i64 %50 = llvm.zext %40 : i32 to i64 %51 = llvm.shl %50, %16 : i64 %52 = llvm.or %49, %51 : i64 %53 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %54 = llvm.extractvalue %53[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %55 = llvm.getelementptr %54[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %56 = llvm.load %55 : !llvm.ptr -> !llvm.ptr %57 = llvm.ptrtoint %56 : !llvm.ptr to i64 %58 = llvm.and %57, %4 : i64 %59 = llvm.icmp "eq" %58, %15 : i64 "llvm.intr.assume"(%59) : (i1) -> () %60 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %61 = llvm.extractvalue %60[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %62 = llvm.getelementptr %61[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %63 = llvm.load %62 : !llvm.ptr -> !llvm.ptr %64 = llvm.ptrtoint %63 : !llvm.ptr to i64 %65 = llvm.and %64, %4 : i64 %66 = llvm.icmp "eq" %65, %15 : i64 "llvm.intr.assume"(%66) : (i1) -> () %67 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %68 = llvm.extractvalue %67[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %69 = llvm.load %68 : !llvm.ptr -> !llvm.ptr %70 = llvm.mul %52, %2 : i64 %71 = llvm.mul %70, %48 : i64 %72 = llvm.mul %71, %44 : i64 %73 = llvm.ptrtoint %69 : !llvm.ptr to i64 %74 = llvm.and %73, %4 : i64 %75 = llvm.icmp "eq" %74, %15 : i64 "llvm.intr.assume"(%75) : (i1) -> () %76 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %77 = 
llvm.extractvalue %76[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %78 = llvm.zext %77 : i32 to i64 %79 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %80 = llvm.extractvalue %79[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %81 = llvm.zext %80 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%82: i64): // 2 preds: ^bb0, ^bb8 %83 = llvm.icmp "slt" %82, %14 : i64 llvm.cond_br %83, ^bb2(%15 : i64), ^bb9 ^bb2(%84: i64): // 2 preds: ^bb1, ^bb7 %85 = llvm.icmp "slt" %84, %12 : i64 llvm.cond_br %85, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%86: i64, %87: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %88 = llvm.icmp "slt" %86, %10 : i64 llvm.cond_br %88, ^bb4(%15, %87 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%89: i64, %90: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %91 = llvm.icmp "slt" %89, %10 : i64 llvm.cond_br %91, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %92 = llvm.add %78, %86 : i64 %93 = llvm.add %84, %89 : i64 %94 = llvm.mul %82, %72 : i64 %95 = llvm.mul %71, %15 : i64 %96 = llvm.add %94, %95 : i64 %97 = llvm.mul %92, %70 : i64 %98 = llvm.add %96, %97 : i64 %99 = llvm.add %98, %93 : i64 %100 = llvm.getelementptr %69[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %78, %86 : i64 %103 = llvm.add %102, %13 : i64 %104 = llvm.mul %82, %72 : i64 %105 = llvm.mul %71, %15 : i64 %106 = llvm.add %104, %105 : i64 %107 = llvm.mul %103, %70 : i64 %108 = llvm.add %106, %107 : i64 %109 = llvm.add %108, %93 : i64 %110 = llvm.getelementptr %69[%109] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %111 = llvm.load %110 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %112 = llvm.add %78, %86 : i64 %113 = llvm.add %112, %14 : i64 %114 = llvm.mul %82, %72 : i64 %115 = llvm.mul %71, %15 : i64 %116 = llvm.add %114, %115 : i64 %117 = llvm.mul %113, %70 : i64 %118 = llvm.add %116, %117 : i64 %119 = llvm.add %118, %93 : i64 %120 = llvm.getelementptr %69[%119] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %121 = llvm.load %120 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %122 = llvm.add %78, %86 : i64 %123 = llvm.add %122, %11 : i64 %124 = llvm.mul %82, %72 : i64 %125 = llvm.mul %71, %15 : i64 %126 = llvm.add %124, %125 : i64 %127 = llvm.mul %123, %70 : i64 %128 = llvm.add %126, %127 : i64 %129 = llvm.add %128, %93 : i64 %130 = llvm.getelementptr %69[%129] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %131 = llvm.load %130 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %132 = llvm.add %78, %86 : i64 %133 = llvm.add %132, %8 : i64 %134 = llvm.mul %82, %72 : i64 %135 = llvm.mul %71, %15 : i64 %136 = llvm.add %134, %135 : i64 %137 = llvm.mul %133, %70 : i64 %138 = llvm.add %136, %137 : i64 %139 = llvm.add %138, %93 : i64 %140 = llvm.getelementptr %69[%139] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %141 = llvm.load %140 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %142 = llvm.add %78, %86 : i64 %143 = llvm.add %142, %10 : i64 %144 = llvm.mul %82, %72 : i64 %145 = llvm.mul %71, %15 : i64 %146 = llvm.add %144, %145 : i64 %147 = llvm.mul %143, %70 : i64 %148 = llvm.add %146, %147 : i64 %149 = llvm.add %148, %93 : i64 %150 = llvm.getelementptr %69[%149] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %151 = llvm.load %150 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %152 = llvm.extractvalue %90[0, 
0] : !llvm.array<1 x array<1 x vector<3xf32>>> %153 = llvm.mul %81, %6 : i64 %154 = llvm.mul %15, %5 : i64 %155 = llvm.add %153, %154 : i64 %156 = llvm.mul %86, %10 : i64 %157 = llvm.add %155, %156 : i64 %158 = llvm.add %157, %89 : i64 %159 = llvm.getelementptr %56[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %160 = llvm.load %159 : !llvm.ptr -> f32 %161 = llvm.insertelement %160, %1[%0 : i32] : vector<3xf32> %162 = llvm.shufflevector %161, %1 [0, 0, 0] : vector<3xf32> %163 = llvm.intr.fmuladd(%101, %162, %152) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %164 = llvm.mul %81, %6 : i64 %165 = llvm.mul %13, %5 : i64 %166 = llvm.add %164, %165 : i64 %167 = llvm.mul %86, %10 : i64 %168 = llvm.add %166, %167 : i64 %169 = llvm.add %168, %89 : i64 %170 = llvm.getelementptr %56[%169] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %171 = llvm.load %170 : !llvm.ptr -> f32 %172 = llvm.insertelement %171, %1[%0 : i32] : vector<3xf32> %173 = llvm.shufflevector %172, %1 [0, 0, 0] : vector<3xf32> %174 = llvm.intr.fmuladd(%111, %173, %163) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %175 = llvm.mul %81, %6 : i64 %176 = llvm.mul %14, %5 : i64 %177 = llvm.add %175, %176 : i64 %178 = llvm.mul %86, %10 : i64 %179 = llvm.add %177, %178 : i64 %180 = llvm.add %179, %89 : i64 %181 = llvm.getelementptr %56[%180] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %182 = llvm.load %181 : !llvm.ptr -> f32 %183 = llvm.insertelement %182, %1[%0 : i32] : vector<3xf32> %184 = llvm.shufflevector %183, %1 [0, 0, 0] : vector<3xf32> %185 = llvm.intr.fmuladd(%121, %184, %174) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %186 = llvm.mul %81, %6 : i64 %187 = llvm.mul %11, %5 : i64 %188 = llvm.add %186, %187 : i64 %189 = llvm.mul %86, %10 : i64 %190 = llvm.add %188, %189 : i64 %191 = llvm.add %190, %89 : i64 %192 = llvm.getelementptr %56[%191] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %193 = llvm.load %192 : !llvm.ptr -> f32 %194 = llvm.insertelement %193, %1[%0 : i32] : vector<3xf32> %195 = llvm.shufflevector %194, %1 [0, 0, 0] : vector<3xf32> %196 = llvm.intr.fmuladd(%131, %195, %185) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %197 = llvm.mul %81, %6 : i64 %198 = llvm.mul %8, %5 : i64 %199 = llvm.add %197, %198 : i64 %200 = llvm.mul %86, %10 : i64 %201 = llvm.add %199, %200 : i64 %202 = llvm.add %201, %89 : i64 %203 = llvm.getelementptr %56[%202] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %204 = llvm.load %203 : !llvm.ptr -> f32 %205 = llvm.insertelement %204, %1[%0 : i32] : vector<3xf32> %206 = llvm.shufflevector %205, %1 [0, 0, 0] : vector<3xf32> %207 = llvm.intr.fmuladd(%141, %206, %196) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %208 = llvm.mul %81, %6 : i64 %209 = llvm.mul %10, %5 : i64 %210 = llvm.add %208, %209 : i64 %211 = llvm.mul %86, %10 : i64 %212 = llvm.add %210, %211 : i64 %213 = llvm.add %212, %89 : i64 %214 = llvm.getelementptr %56[%213] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %215 = llvm.load %214 : !llvm.ptr -> f32 %216 = llvm.insertelement %215, %1[%0 : i32] : vector<3xf32> %217 = llvm.shufflevector %216, %1 [0, 0, 0] : vector<3xf32> %218 = llvm.intr.fmuladd(%151, %217, %207) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %219 = llvm.insertvalue %218, %7[0] : !llvm.array<1 x vector<3xf32>> %220 = llvm.insertvalue %219, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %221 = llvm.add %89, %13 : i64 llvm.br ^bb4(%221, %220 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %222 = llvm.add %86, %13 : i64 
llvm.br ^bb3(%222, %90 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %223 = llvm.extractvalue %87[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %224 = llvm.mul %82, %3 : i64 %225 = llvm.mul %81, %4 : i64 %226 = llvm.add %224, %225 : i64 %227 = llvm.mul %78, %12 : i64 %228 = llvm.add %226, %227 : i64 %229 = llvm.add %228, %84 : i64 %230 = llvm.getelementptr %63[%229] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %223, %230 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %231 = llvm.add %84, %11 : i64 llvm.br ^bb2(%231 : i64) ^bb8: // pred: ^bb2 %232 = llvm.add %82, %13 : i64 llvm.br ^bb1(%232 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } }
// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibilityPass (iree-llvmcpu-synchronize-symbol-visibility) //----- //
// (module identical to the preceding dump)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (module identical to the preceding dump)
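For reference, the op being lowered in all of these dumps is a plain 2-D convolution; the dispatch name @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 encodes the shapes: NCHW output 2x4x7x9, reduced over 6 input channels and a 5x5 window, so the input window feeding it is 2x6x11x13 (7 = 11 - 5 + 1 and 9 = 13 - 5 + 1), assuming the unit strides and dilations that the indexing here implies. A minimal NumPy sketch of that computation (function and variable names are illustrative, not from IREE):

import numpy as np

def conv_2d_nchw_fchw(x, w):
    # x: NCHW input, w: FCHW filter; unit strides and dilations.
    n, c, h, wid = x.shape
    f, _, kh, kw = w.shape
    oh, ow = h - kh + 1, wid - kw + 1
    out = np.zeros((n, f, oh, ow), dtype=x.dtype)
    for b in range(n):
        for of in range(f):
            for y in range(oh):
                for xo in range(ow):
                    # Dot product of one input window with one filter.
                    out[b, of, y, xo] = np.sum(x[b, :, y:y+kh, xo:xo+kw] * w[of])
    return out

x = np.random.rand(2, 6, 11, 13).astype(np.float32)
w = np.random.rand(4, 6, 5, 5).astype(np.float32)
assert conv_2d_nchw_fchw(x, w).shape == (2, 4, 7, 9)

In the LLVM-dialect dumps this shows up as: the ^bb2 loop steps the output width in chunks of 3 (the vector<3xf32> accumulator; 9 = 3 x 3), ^bb5 loads 3-wide input vectors, broadcasts one filter scalar at a time via insertelement plus shufflevector, and accumulates with llvm.intr.fmuladd, and ^bb7 stores the finished 3-wide result to the output buffer.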
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (module identical to the preceding dump)
// -----// IR Dump Before CSE (cse) //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr
{llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %22 = llvm.extractvalue %21[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %23 = llvm.getelementptr %22[3] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %26 = llvm.extractvalue %25[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %27 = llvm.getelementptr %26[4] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %30 = llvm.extractvalue %29[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %31 = llvm.getelementptr %30[5] : (!llvm.ptr) -> !llvm.ptr, i32 %32 = llvm.load %31 : !llvm.ptr -> i32 %33 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %34 = llvm.extractvalue %33[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %35 = llvm.getelementptr %34[6] : (!llvm.ptr) -> !llvm.ptr, i32 %36 = llvm.load %35 : !llvm.ptr -> i32 %37 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %38 = llvm.extractvalue %37[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %39 = llvm.getelementptr %38[7] : (!llvm.ptr) -> !llvm.ptr, i32 %40 = llvm.load %39 : !llvm.ptr -> i32 %41 = llvm.zext %20 : i32 to i64 %42 = llvm.zext %24 : i32 to i64 %43 = llvm.shl %42, %16 : i64 %44 = llvm.or %41, %43 : i64 %45 = llvm.zext %28 : i32 to i64 %46 = llvm.zext %32 : i32 to 
i64 %47 = llvm.shl %46, %16 : i64 %48 = llvm.or %45, %47 : i64 %49 = llvm.zext %36 : i32 to i64 %50 = llvm.zext %40 : i32 to i64 %51 = llvm.shl %50, %16 : i64 %52 = llvm.or %49, %51 : i64 %53 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %54 = llvm.extractvalue %53[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %55 = llvm.getelementptr %54[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %56 = llvm.load %55 : !llvm.ptr -> !llvm.ptr %57 = llvm.ptrtoint %56 : !llvm.ptr to i64 %58 = llvm.and %57, %4 : i64 %59 = llvm.icmp "eq" %58, %15 : i64 "llvm.intr.assume"(%59) : (i1) -> () %60 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %61 = llvm.extractvalue %60[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %62 = llvm.getelementptr %61[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %63 = llvm.load %62 : !llvm.ptr -> !llvm.ptr %64 = llvm.ptrtoint %63 : !llvm.ptr to i64 %65 = llvm.and %64, %4 : i64 %66 = llvm.icmp "eq" %65, %15 : i64 "llvm.intr.assume"(%66) : (i1) -> () %67 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %68 = llvm.extractvalue %67[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %69 = llvm.load %68 : !llvm.ptr -> !llvm.ptr %70 = llvm.mul %52, %2 : i64 %71 = llvm.mul %70, %48 : i64 %72 = llvm.mul %71, %44 : i64 %73 = llvm.ptrtoint %69 : !llvm.ptr to i64 %74 = llvm.and %73, %4 : i64 %75 = llvm.icmp "eq" %74, %15 : i64 "llvm.intr.assume"(%75) : (i1) -> () %76 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %77 = llvm.extractvalue %76[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %78 = llvm.zext %77 : i32 to i64 %79 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %80 = llvm.extractvalue %79[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %81 = llvm.zext %80 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%82: i64): // 2 preds: ^bb0, ^bb8 %83 = llvm.icmp "slt" %82, %14 : i64 llvm.cond_br %83, ^bb2(%15 : i64), ^bb9 ^bb2(%84: i64): // 2 preds: ^bb1, ^bb7 %85 = llvm.icmp "slt" %84, %12 : i64 llvm.cond_br %85, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%86: i64, %87: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %88 = llvm.icmp "slt" %86, %10 : i64 llvm.cond_br %88, ^bb4(%15, %87 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%89: i64, %90: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %91 = llvm.icmp "slt" %89, %10 : i64 llvm.cond_br %91, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %92 = llvm.add %78, %86 : i64 %93 = llvm.add %84, %89 : i64 %94 = llvm.mul %82, %72 : i64 %95 = llvm.mul %71, %15 : i64 %96 = llvm.add %94, %95 : i64 %97 = llvm.mul %92, %70 : i64 %98 = llvm.add %96, %97 : i64 %99 = llvm.add %98, %93 : i64 %100 = llvm.getelementptr %69[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> 
%102 = llvm.add %78, %86 : i64 %103 = llvm.add %102, %13 : i64 %104 = llvm.mul %82, %72 : i64 %105 = llvm.mul %71, %15 : i64 %106 = llvm.add %104, %105 : i64 %107 = llvm.mul %103, %70 : i64 %108 = llvm.add %106, %107 : i64 %109 = llvm.add %108, %93 : i64 %110 = llvm.getelementptr %69[%109] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %111 = llvm.load %110 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %112 = llvm.add %78, %86 : i64 %113 = llvm.add %112, %14 : i64 %114 = llvm.mul %82, %72 : i64 %115 = llvm.mul %71, %15 : i64 %116 = llvm.add %114, %115 : i64 %117 = llvm.mul %113, %70 : i64 %118 = llvm.add %116, %117 : i64 %119 = llvm.add %118, %93 : i64 %120 = llvm.getelementptr %69[%119] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %121 = llvm.load %120 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %122 = llvm.add %78, %86 : i64 %123 = llvm.add %122, %11 : i64 %124 = llvm.mul %82, %72 : i64 %125 = llvm.mul %71, %15 : i64 %126 = llvm.add %124, %125 : i64 %127 = llvm.mul %123, %70 : i64 %128 = llvm.add %126, %127 : i64 %129 = llvm.add %128, %93 : i64 %130 = llvm.getelementptr %69[%129] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %131 = llvm.load %130 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %132 = llvm.add %78, %86 : i64 %133 = llvm.add %132, %8 : i64 %134 = llvm.mul %82, %72 : i64 %135 = llvm.mul %71, %15 : i64 %136 = llvm.add %134, %135 : i64 %137 = llvm.mul %133, %70 : i64 %138 = llvm.add %136, %137 : i64 %139 = llvm.add %138, %93 : i64 %140 = llvm.getelementptr %69[%139] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %141 = llvm.load %140 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %142 = llvm.add %78, %86 : i64 %143 = llvm.add %142, %10 : i64 %144 = llvm.mul %82, %72 : i64 %145 = llvm.mul %71, %15 : i64 %146 = llvm.add %144, %145 : i64 %147 = llvm.mul %143, %70 : i64 %148 = llvm.add %146, %147 : i64 %149 = llvm.add %148, %93 : i64 %150 = llvm.getelementptr %69[%149] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %151 = llvm.load %150 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %152 = llvm.extractvalue %90[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %153 = llvm.mul %81, %6 : i64 %154 = llvm.mul %15, %5 : i64 %155 = llvm.add %153, %154 : i64 %156 = llvm.mul %86, %10 : i64 %157 = llvm.add %155, %156 : i64 %158 = llvm.add %157, %89 : i64 %159 = llvm.getelementptr %56[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %160 = llvm.load %159 : !llvm.ptr -> f32 %161 = llvm.insertelement %160, %1[%0 : i32] : vector<3xf32> %162 = llvm.shufflevector %161, %1 [0, 0, 0] : vector<3xf32> %163 = llvm.intr.fmuladd(%101, %162, %152) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %164 = llvm.mul %81, %6 : i64 %165 = llvm.mul %13, %5 : i64 %166 = llvm.add %164, %165 : i64 %167 = llvm.mul %86, %10 : i64 %168 = llvm.add %166, %167 : i64 %169 = llvm.add %168, %89 : i64 %170 = llvm.getelementptr %56[%169] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %171 = llvm.load %170 : !llvm.ptr -> f32 %172 = llvm.insertelement %171, %1[%0 : i32] : vector<3xf32> %173 = llvm.shufflevector %172, %1 [0, 0, 0] : vector<3xf32> %174 = llvm.intr.fmuladd(%111, %173, %163) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %175 = llvm.mul %81, %6 : i64 %176 = llvm.mul %14, %5 : i64 %177 = llvm.add %175, %176 : i64 %178 = llvm.mul %86, %10 : i64 %179 = llvm.add %177, %178 : i64 %180 = llvm.add %179, %89 : i64 %181 = llvm.getelementptr %56[%180] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %182 = llvm.load %181 : !llvm.ptr -> f32 %183 = llvm.insertelement %182, %1[%0 : i32] : vector<3xf32> %184 = llvm.shufflevector %183, %1 [0, 0, 
0] : vector<3xf32> %185 = llvm.intr.fmuladd(%121, %184, %174) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %186 = llvm.mul %81, %6 : i64 %187 = llvm.mul %11, %5 : i64 %188 = llvm.add %186, %187 : i64 %189 = llvm.mul %86, %10 : i64 %190 = llvm.add %188, %189 : i64 %191 = llvm.add %190, %89 : i64 %192 = llvm.getelementptr %56[%191] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %193 = llvm.load %192 : !llvm.ptr -> f32 %194 = llvm.insertelement %193, %1[%0 : i32] : vector<3xf32> %195 = llvm.shufflevector %194, %1 [0, 0, 0] : vector<3xf32> %196 = llvm.intr.fmuladd(%131, %195, %185) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %197 = llvm.mul %81, %6 : i64 %198 = llvm.mul %8, %5 : i64 %199 = llvm.add %197, %198 : i64 %200 = llvm.mul %86, %10 : i64 %201 = llvm.add %199, %200 : i64 %202 = llvm.add %201, %89 : i64 %203 = llvm.getelementptr %56[%202] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %204 = llvm.load %203 : !llvm.ptr -> f32 %205 = llvm.insertelement %204, %1[%0 : i32] : vector<3xf32> %206 = llvm.shufflevector %205, %1 [0, 0, 0] : vector<3xf32> %207 = llvm.intr.fmuladd(%141, %206, %196) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %208 = llvm.mul %81, %6 : i64 %209 = llvm.mul %10, %5 : i64 %210 = llvm.add %208, %209 : i64 %211 = llvm.mul %86, %10 : i64 %212 = llvm.add %210, %211 : i64 %213 = llvm.add %212, %89 : i64 %214 = llvm.getelementptr %56[%213] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %215 = llvm.load %214 : !llvm.ptr -> f32 %216 = llvm.insertelement %215, %1[%0 : i32] : vector<3xf32> %217 = llvm.shufflevector %216, %1 [0, 0, 0] : vector<3xf32> %218 = llvm.intr.fmuladd(%151, %217, %207) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %219 = llvm.insertvalue %218, %7[0] : !llvm.array<1 x vector<3xf32>> %220 = llvm.insertvalue %219, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %221 = llvm.add %89, %13 : i64 llvm.br ^bb4(%221, %220 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %222 = llvm.add %86, %13 : i64 llvm.br ^bb3(%222, %90 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %223 = llvm.extractvalue %87[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %224 = llvm.mul %82, %3 : i64 %225 = llvm.mul %81, %4 : i64 %226 = llvm.add %224, %225 : i64 %227 = llvm.mul %78, %12 : i64 %228 = llvm.add %226, %227 : i64 %229 = llvm.add %228, %84 : i64 %230 = llvm.getelementptr %63[%229] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %223, %230 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %231 = llvm.add %84, %11 : i64 llvm.br ^bb2(%231 : i64) ^bb8: // pred: ^bb2 %232 = llvm.add %82, %13 : i64 llvm.br ^bb1(%232 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } // -----// IR Dump After CSE (cse) //----- // module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = 
llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 
%138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } // -----// IR Dump Before AddFastMathFlagsPass (iree-codegen-add-fast-math-flags) //----- // llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = 
llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : 
i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> 
%132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } // -----// IR Dump After AddFastMathFlagsPass (iree-codegen-add-fast-math-flags) //----- // llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, 
%arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, 
i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 
%129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 
llvm.return %0 : i32 } // -----// IR Dump After TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- // hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = 
llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = 
llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 
= llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } // -----// IR Dump After TranslateExecutablesPass (iree-hal-translate-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = 
llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, 
%14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = 
llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } // -----// IR Dump Before ConvertToHALPass (iree-hal-conversion) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = 
#device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and 
%46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> 
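// NOTE: ^bb5 is the fully unrolled innermost body of the conv kernel: six
// multiply-accumulate steps, one per input channel of the 4x6x5x5 filter.
// Each step loads a 3-wide f32 vector from the input buffer (%58, binding 0),
// broadcasts one filter scalar loaded from %45 (binding 1; FCHW indexing
// f*150 + c*25 + kh*5 + kw), and folds the product into the vector<3xf32>
// accumulator %78 via llvm.intr.fmuladd. The enclosing loops appear to be
// batch (^bb1, bound 2), output width in steps of 3 (^bb2, bound 9), and the
// 5x5 filter window (^bb3/^bb4); ^bb7 stores the finished 3-wide output tile
// through %52 (binding 2).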
%108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : 
!llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { 
stream.cmd.dispatch @main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After ConvertToHALPass (iree-hal-conversion) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load 
%19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : 
i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load 
%154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = 
hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : 
!hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None")
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
    %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence
    hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd])
    %c-1_i32 = arith.constant -1 : i32
    %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32
    util.status.check_ok %status, "failed to wait on timepoint"
    %dense_row_major_12 = hal.encoding_type<dense_row_major> : i32
    %element_type_f32_13 = hal.element_type<f32> : i32
    %c2_14 = arith.constant 2 : index
    %c4_15 = arith.constant 4 : index
    %c7_16 = arith.constant 7 : index
    %c9 = arith.constant 9 : index
    %c0_17 = arith.constant 0 : index
    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view
    util.return %view : !hal.buffer_view
  }
}
// -----// IR Dump Before InlineMemoizeRegionsPass (iree-hal-inline-memoize-regions) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c32_i64 = arith.constant 32 : i64
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %__device_0 = util.global.load immutable @__device_0 : !hal.device
  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %__device_0_1 = util.global.load immutable @__device_0 : !hal.device
  %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator
  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible)
    usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
  %__device_0_3 = util.global.load immutable @__device_0 : !hal.device
  %c-1_i64 = arith.constant -1 : i64
  %8 = util.null : !hal.fence
  %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence
  %c0_i64 = arith.constant 0 : i64
  %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016}
  %9 = arith.index_castui %0 : index to i64
  %10 = arith.trunci %9 : i64 to i32
  %11 = arith.shrui %9, %c32_i64 : i64
  %12 = arith.trunci %11 : i64 to i32
  %13 = arith.index_castui %1 : index to i64
  %14 = arith.trunci %13 : i64 to i32
  %15 = arith.shrui %13, %c32_i64 : i64
  %16 = arith.trunci %15 : i64 to i32
  %17 = arith.index_castui %2 : index to i64
  %18 = arith.trunci %17 : i64 to i32
  %19 = arith.shrui %17, %c32_i64 : i64
  %20 = arith.trunci %19 : i64 to i32
  %21 = arith.index_castui %3 : index to i64
  %22 = arith.trunci %21 : i64 to i32
  %23 = arith.shrui %21, %c32_i64 : i64
  %24 = arith.trunci %23 : i64 to i32
  %__device_0_4 = util.global.load immutable @__device_0 : !hal.device
  %c-1_i64_5 = arith.constant -1 : i64
  %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer
  %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
  %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) : !hal.pipeline_layout
  hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
  %c0_6 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2_7 = arith.constant 2 : index
  %c0_8 = arith.constant 0 : index
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([
    %c0_6 = (%buffer : !hal.buffer)[%c0, %7],
    %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
    %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
  ])
  %c7 = arith.constant 7 : index
  %c4_9 = arith.constant 4 : index
  %c1_10 = arith.constant 1 : index
  %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable
  %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None")
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
  %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence
  hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd])
  %c-1_i32 = arith.constant -1 : i32
  %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32
  util.status.check_ok %status, "failed to wait on timepoint"
  %dense_row_major_12 = hal.encoding_type<dense_row_major> : 
i32 %element_type_f32_13 = hal.element_type : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After InlineMemoizeRegionsPass (iree-hal-inline-memoize-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : 
index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %dense_row_major_12 = hal.encoding_type : i32 %element_type_f32_13 = hal.element_type : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before FixupLegacySyncPass (iree-hal-fixup-legacy-sync) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = 
"x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> 
vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : 
!llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load 
immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %dense_row_major_12 = hal.encoding_type<dense_row_major> : i32 %element_type_f32_13 = hal.element_type<f32> : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index 
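// Note (annotation, not part of the original dump): the queue-allocated
// transient buffer wrapped below is exactly the size of the static result:
// 2*4*7*9 = 504 f32 elements * 4 bytes = 2016 bytes (%c2016).
// hal.buffer_view.create attaches only shape/element-type/encoding metadata
// so the buffer can be returned through the !hal.buffer_view ABI; the eight
// push constants above carry each dynamic input dim as a lo/hi i32 pair
// (trunci for bits 0..31, shrui 32 + trunci for bits 32..63), which the
// dispatch reassembles with llvm.zext/shl/or.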
%view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After FixupLegacySyncPass (iree-hal-fixup-legacy-sync) //----- //
// (module unchanged by this pass; the dump repeats the module above verbatim)
// -----// IR Dump Before PruneExecutablesPass (iree-hal-prune-executables) //----- //
// (identical verbatim repeat of the module above)
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = 
llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = 
llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = 
llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index
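// NOTE: host-side ABI wrapper. The ops below query the four dynamic
// dimensions of %input0 from the incoming !hal.buffer_view, assert the f32
// element type and dense-row-major encoding, and compute the required byte
// length (dim0 * 4 bytes per f32 element * dim1 * dim2 * dim3) that is then
// checked against the bound buffer.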
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index
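// NOTE: the descriptor set pushed next has three storage-buffer bindings:
// binding 0 is the dynamically sized input (%7 bytes), binding 1 the
// 4x6x5x5xf32 filter (600 elements * 4 bytes = 2400), and binding 2 the
// transient 2x4x7x9xf32 output (504 elements * 4 bytes = 2016).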
%c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %dense_row_major_12 = hal.encoding_type<dense_row_major> : i32 %element_type_f32_13 = hal.element_type<f32> : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2:
!llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = 
llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = 
llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32
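// NOTE: each dynamic dimension above is widened to i64 and split into
// low/high i32 halves (trunci plus shrui by 32) so it fits the eight i32
// push constants; the dispatch function reassembles the halves with
// zext + shl + or into the i64 sizes used for indexing.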
%__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %dense_row_major_12 = hal.encoding_type<dense_row_major> : i32 %element_type_f32_13 = hal.element_type<f32> : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index,
%arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, 
!llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, 
i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = 
llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32
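// NOTE: compared with the "Before CSE" dump, the repeated
// util.global.load @__device_0 / hal.device.allocator loads and the
// duplicated index constants have been folded into single definitions
// (%__device_0, %allocator, %c-1_i64, ...).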
%23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %c9 = arith.constant 9 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout =
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 
= llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> 
!llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer
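// NOTE: command-buffer recording follows: push constants, one descriptor
// set with the three buffer bindings, a single dispatch over a 7x4x1
// workgroup grid, and an execution barrier before finalization; execution
// is queued after %fence and the host blocks on %fence_1 before building
// the result buffer view.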
hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
%pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
  %c0 = (%buffer : !hal.buffer)[%c0, %7],
  %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
  %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
])
%c7 = arith.constant 7 : index
%exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable
%ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None")
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd])
%c-1_i32 = arith.constant -1 : i32
%status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%c9 = arith.constant 9 : index
%view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
util.return %view : !hal.buffer_view
} }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#translation = #iree_codegen.translation_info
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module {
util.global private @__device_0 = #device_target_local
hal.executable private @main_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
%c7 = arith.constant 7 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
hal.return %c7, %c4, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr
{llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = 
llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, 
%1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } 
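// NOTE: everything above closes out the compiled dispatch. The llvm.func is
// the conv_2d_nchw_fchw kernel after full lowering to the LLVM dialect; the
// util.func @main that follows is the host-side program that validates the
// input buffer views, allocates the transient result buffer, and records the
// dispatch into a HAL command buffer. A dump stream like this is normally
// obtained via MLIR's IR-printing options; a plausible invocation (an
// assumption, not taken from this log) would be:
//   iree-compile conv.mlir -o conv.vmfb \
//     --iree-hal-target-backends=llvm-cpu \
//     --mlir-print-ir-before-all --mlir-print-ir-after-all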
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout 
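// NOTE: the eight i32 push constants recorded below encode the four dynamic
// input dimensions. Each 64-bit extent d is split on the host as
//   lo = trunci(d : i64 to i32),  hi = trunci(shrui(d, 32) : i64 to i32)
// and reassembled inside the dispatch (see the zext/shl/or sequence in the
// llvm.func above) as
//   d = or(zext(lo : i32 to i64), shl(zext(hi : i32 to i64), 32))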
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = 
llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add 
%134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = 
arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : 
!hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x 
vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, 
i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> 
vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index 
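// NOTE: the buffer lengths used below follow directly from the static shapes:
//   %c2400 = 4*6*5*5 filter elements * 4 bytes per f32 = 2400 bytes
//   %c2016 = 2*4*7*9 output elements * 4 bytes per f32 = 2016 bytes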
%c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal 
target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 
: !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016}
%9 = arith.index_castui %1 : index to i64
%10 = arith.trunci %9 : i64 to i32
%11 = arith.shrui %9, %c32_i64 : i64
%12 = arith.trunci %11 : i64 to i32
%13 = arith.index_castui %2 : index to i64
%14 = arith.trunci %13 : i64 to i32
%15 = arith.shrui %13, %c32_i64 : i64
%16 = arith.trunci %15 : i64 to i32
%17 = arith.index_castui %3 : index to i64
%18 = arith.trunci %17 : i64 to i32
%19 = arith.shrui %17, %c32_i64 : i64
%20 = arith.trunci %19 : i64 to i32
%21 = arith.index_castui %4 : index to i64
%22 = arith.trunci %21 : i64 to i32
%23 = arith.shrui %21, %c32_i64 : i64
%24 = arith.trunci %23 : i64 to i32
%cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) : !hal.pipeline_layout
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
  %c0 = (%buffer : !hal.buffer)[%c0, %8],
  %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
  %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
])
%exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable
%ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None")
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd])
%status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
util.return %view : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
%__device_0 = util.global.load immutable @__device_0 : !hal.device
%c9 = arith.constant 9 : index
%c-1_i32 = arith.constant -1 : i32
%c7 = arith.constant 7 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0_i64 = arith.constant 0 : i64
%0 = util.null : !hal.fence
%c-1_i64 = arith.constant -1 : i64
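// NOTE: comparing this dump with the one before
// iree-util-simplify-global-accesses, the only visible change is that the
// immutable util.global.load @__device_0 has been hoisted to the function
// entry (it previously appeared after %buffer); the rest of @main is
// unchanged.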
%c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup
device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) :
i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, 
^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 =
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe :
!hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 =
llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" 
%74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> 
vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major =
hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create
device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : 
!llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : 
i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector 
%156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : 
!hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = 
hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> 
i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> 
vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals 
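A note on the ABI visible in the dump above: the dynamic shape of %input0 crosses the host/device boundary as push constants. On the host side, @main widens each dynamic `index` dimension to i64 (arith.index_castui), splits it into low and high 32-bit words (arith.trunci / arith.shrui), and hands all eight words to hal.command_buffer.push_constants; push constants travel as flat 32-bit words in the dispatch state, hence the split rather than passing i64 values directly. On the device side, the kernel reads the words it needs back out of iree_hal_executable_dispatch_state_v0_t and reassembles each dimension with llvm.zext/llvm.shl/llvm.or, then chains them into row-major strides (the %59 = d3, %60 = d3*d2, %61 = d3*d2*d1 sequence). A minimal C sketch of that round trip follows; the helper names are illustrative only, not part of IREE's API:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Host side: split one 64-bit dimension into two 32-bit push-constant
       words, mirroring the arith.trunci / arith.shrui pairs in @main. */
    static void pack_dim(uint64_t dim, uint32_t *lo, uint32_t *hi) {
      *lo = (uint32_t)dim;
      *hi = (uint32_t)(dim >> 32);
    }

    /* Device side: reassemble the dimension from the two words, mirroring
       the llvm.zext / llvm.shl / llvm.or chain in the dispatch function. */
    static uint64_t unpack_dim(uint32_t lo, uint32_t hi) {
      return (uint64_t)lo | ((uint64_t)hi << 32);
    }

    int main(void) {
      /* The four dynamic dims of %input0 in this trace: 2x6x11x13 (NCHW). */
      uint64_t dims[4] = {2, 6, 11, 13};
      uint32_t words[8];
      for (int i = 0; i < 4; ++i)
        pack_dim(dims[i], &words[2 * i], &words[2 * i + 1]);
      for (int i = 0; i < 4; ++i)
        assert(unpack_dim(words[2 * i], words[2 * i + 1]) == dims[i]);
      /* Row-major strides the kernel derives (the %59/%60/%61 chain). */
      printf("w=1 h=%llu c=%llu n=%llu\n",
             (unsigned long long)dims[3],
             (unsigned long long)(dims[3] * dims[2]),
             (unsigned long long)(dims[3] * dims[2] * dims[1]));
      return 0;
    }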
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged; body identical to the "IR Dump After FoldGlobals" module above, elided)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged; body identical to the "IR Dump After FoldGlobals" module above, elided)
// -----// IR Dump Before LinkExecutablesPass (iree-hal-link-executables) //----- //
// (module unchanged; body identical to the "IR Dump After FoldGlobals" module above, elided)
// -----// IR Dump Before LinkTargetExecutablesPass (iree-hal-link-target-executables) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#translation = #iree_codegen.translation_info
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module {
  util.global private @__device_0 = #device_target_local
  hal.executable private @main_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64
target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = 
llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : 
!llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = 
llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : 
index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4
: index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to 
i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> 
%140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (IR unchanged by this pass; duplicate dump elided.)
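
The zext/shl/or chains on %31..%42 in the dump above are the device half of IREE's dynamic-shape ABI: the host pushed each 64-bit extent of the tensor<?x?x?x?xf32> input as a lo/hi pair of 32-bit push constants (the eight values in hal.command_buffer.push_constants), and the kernel reassembles them from the constants array of iree_hal_executable_dispatch_state_v0_t. A minimal C sketch of that recombination, assuming only the packing visible in the IR (function and parameter names are made up):

#include <stdint.h>

/* Rebuild one i64 extent from a lo/hi pair of i32 push constants,
 * mirroring the zext + shl(32) + or sequence on %31..%34. */
static inline uint64_t unpack_dim(const uint32_t *push_constants, int i) {
  uint64_t lo = (uint64_t)push_constants[2 * i];      /* e.g. %20 -> %31 */
  uint64_t hi = (uint64_t)push_constants[2 * i + 1];  /* e.g. %22 -> %32 */
  return lo | (hi << 32);                             /* %33, %34 */
}
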
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> 
%114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br 
// -----// IR Dump Before LLVMCPUAssignConstantOrdinalsPass (iree-llvmcpu-assign-constant-ordinals) //----- //
// (variant IR identical to the dump above; duplicate elided.)
// -----// IR Dump After LLVMCPUAssignConstantOrdinalsPass (iree-llvmcpu-assign-constant-ordinals) //----- //
// (IR unchanged by this pass; duplicate dump elided.)
// -----// IR Dump Before LLVMCPUAssignImportOrdinalsPass (iree-llvmcpu-assign-import-ordinals) //----- //
// (variant IR identical to the dump above; duplicate elided.)
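
For orientation: the export region launches a 7x4x1 workgroup grid, and the output store in ^bb7 (strides 252 = 4*7*9, 63 = 7*9, and 9) suggests workgroup id x selects an output row and id y an output channel of the 2x4x7x9 result. Within each workgroup, ^bb1 iterates the batch (bound 2), ^bb2 steps the output width three columns at a time in a vector<3xf32> accumulator, and ^bb3/^bb4 walk the 5x5 filter window, with six llvm.intr.fmuladd ops unrolled in ^bb5, one per input channel of the 4x6x5x5 filter. A hedged structural sketch in C (index arithmetic elided; not a drop-in kernel):

/* Per-workgroup loop structure recovered from ^bb1..^bb8. */
void workgroup_body(int oh /* workgroup id x */, int f /* workgroup id y */) {
  for (int n = 0; n < 2; ++n) {            /* ^bb1: batch */
    for (int ow = 0; ow < 9; ow += 3) {    /* ^bb2: 3 output columns per step */
      float acc[3] = {0.0f, 0.0f, 0.0f};   /* %9: zeroed accumulator */
      for (int a = 0; a < 5; ++a) {        /* ^bb3: filter window */
        for (int b = 0; b < 5; ++b) {      /* ^bb4: filter window */
          /* ^bb5: six fmuladds, each broadcasting one filter weight
           * against three input values: acc += in3 * splat(w). */
        }
      }
      /* ^bb7: store acc to out[n][f][oh][ow .. ow+2]. */
    }
  }
}
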
// -----// IR Dump After LLVMCPUAssignImportOrdinalsPass (iree-llvmcpu-assign-import-ordinals) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 =
llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 
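// The innermost block ^bb5 linearizes row-major offsets by hand. For the dynamically shaped NCHW input (binding 0, base %58) the strides are rebuilt from the push constants above: %59 = W, %60 = W*H, %61 = W*H*C, i.e. 13, 143 and 858 for the 2x6x11x13 input of this example, so element (n, c, h, w) of a row-major NCHW buffer sits at offset n*%61 + c*%60 + h*%59 + w.
// The loop nest reads: ^bb1 over the batch (%70 < 2), ^bb2 over the output width in steps of 3 (%72 < 9, one vector<3xf32> tile per step), ^bb3 over the filter height (%74 < 5), ^bb4 over the filter width (%77 < 5); the six input-channel iterations are fully unrolled into the six load/llvm.intr.fmuladd chains that follow.
// Filter elements come from scalar loads at f*150 + c*25 + kh*5 + kw (the row-major strides of the static 4x6x5x5 filter) and are splatted to vector<3xf32> with insertelement + shufflevector before each fmuladd.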
%86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, 
vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } // -----// IR Dump After LinkTargetExecutablesPass (iree-hal-link-target-executables) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 
16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : 
!llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : 
vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public 
@main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout 
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x
array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", 
(i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> 
vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = 
arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal 
target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After LinkExecutablesPass (iree-hal-link-executables) //----- //
// (module unchanged by this pass; the dump is a verbatim copy of the IR above and is elided)
// -----// IR Dump Before ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) //----- //
// (identical to the preceding dump; verbatim duplicate elided)
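// The next dump shows the only change iree-hal-resolve-export-ordinals makes to this
// module: the symbolic export-ordinal query feeding the dispatch is folded to the
// export's resolved ordinal. @main_dispatch_0 declares a single export at ordinal(0),
// so in @main the pair
//   %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index
//   hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None")
// becomes
//   %c0_1 = arith.constant 0 : index
//   hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None")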
source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, 
ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: 
!llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, 
%77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) 
type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> 
affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = 
llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 
: i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : 
(vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : 
!hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR 
Dump Before MaterializeResourceCachesPass (iree-hal-materialize-resource-caches) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 
: i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 
%95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = 
llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 =
hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After MaterializeResourceCachesPass (iree-hal-materialize-resource-caches) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> 
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %value, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 =
llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br 
^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 
%143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 =
arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer>
target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before MemoizeDeviceQueriesPass (iree-hal-memoize-device-queries) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %value, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32
ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, 
ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = 
llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = 
llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 =
arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After MemoizeDeviceQueriesPass (iree-hal-memoize-device-queries) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private
@__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = 
llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), 
^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> 
!llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : 
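// NOTE: in the kernel tail above, ^bb7 stores the accumulated vector<3xf32> directly to
// the output binding (alignment 4) and bumps the output-width induction variable by 3; the
// enclosing nest iterates batch (bound 2) and output width (bound 9, step 3), the two inner
// bounds of 5 walk the 5x5 filter window (the filter index is oc*150 + ic*25 + kh*5 + kw),
// and oh and oc come from workgroup ids x and y of the 7x4x1 grid.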
!hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : 
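// NOTE: this @main ABI wrapper validates both buffer views against the expected element
// type and shape, computes the input byte length as dim0*4*dim1*dim2*dim3, queue-allocates
// a 2016-byte transient buffer (2*4*7*9 floats) for the result, and records a one-shot
// command buffer: eight i32 push constants (the four i64 dims split into lo/hi halves),
// three buffer bindings, one 7x4x1 dispatch, a barrier, then execute + fence await.
// The dim packing sketched in C (names hypothetical):
//   push_constants[2*i]     = (uint32_t)dims[i];          // arith.trunci
//   push_constants[2*i + 1] = (uint32_t)(dims[i] >> 32);  // arith.shrui + trunci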
!hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : 
!hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", 
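// NOTE: the initializers above run once at module load: hal.device.query asks whether
// __device_0 supports the "embedded-elf-x86_64" format, and scf.index_switch either creates
// the executable from @main_dispatch_0::@embedded_elf_x86_64 or fails with status code 14
// (UNAVAILABLE in IREE's status convention), caching the pipeline layout and executable in
// util.globals so @main only pays a global load per call. The export name
// main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 encodes the problem itself: a 2x4x7x9
// output with a 6x5x5 reduction. With unit stride, unit dilation, and no padding, the
// spatial sizes follow out = in - k + 1, so an 11x13 input under a 5x5 filter yields 7x9.
// A quick check in C:
//   int oh = 11 - 5 + 1, ow = 13 - 5 + 1;            // 7, 9
//   size_t bytes = 2 * 4 * oh * ow * sizeof(float);  // 2016, the transient buffer size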
(i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = 
llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, 
%1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : 
!hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private 
@__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = 
llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 
%138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = 
arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = 
(%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, 
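// NOTE: this Before-Canonicalizer dump shows the module exactly as CSE left it; adjacent
// "After pass X / Before pass Y" dumps in a --mlir-print-ir-after-all /
// --mlir-print-ir-before-all log are expected to match when no pass runs in between.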
%c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = 
llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = 
llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = 
llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : 
!hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info
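// Annotation: in @main, each of the four dynamic i64 dimensions of %input0 is split into two i32 push constants, lo = trunci(d) and hi = trunci(shrui(d, 32)), which is why #pipeline_layout declares push_constants = 8 for a single dispatch. The buffer asserts check against the static element counts: %c2400 = 4*6*5*5 filter elements * 4 B, and the transient result buffer is %c2016 = 2*4*7*9 elements * 4 B. The recorded workgroup count [%c7, %c4, %c1] is effectively one workgroup per (output row, filter) pair, with the batch and width loops kept inside the kernel.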
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : 
i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 
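// Annotation: %34, %38 and %42 rebuild the i64 sizes from push constants 2..7 as zext(lo) | (zext(hi) << 32); the batch dimension is not reloaded because it has already folded to the constant 2 (%14). %59, %60 and %61 are then the row-major strides W, W*H and W*H*C of the dynamic input, and each binding base pointer (%46, %53, %62) is proven 64-byte aligned for LLVM with a ptrtoint / and %4 (= 63) / icmp eq 0 / llvm.intr.assume chain.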
"llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = 
llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: 
tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : 
!hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After CSE (cse) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64,
llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> 
!llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> 
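// Annotation: %128 -> %129 -> %130 is a scalar splat: the filter value just loaded is inserted into lane 0 of a vector<3xf32> and then broadcast with shufflevector mask [0, 0, 0], so each llvm.intr.fmuladd accumulates three horizontally adjacent output pixels at once; this is why the width loop in ^bb2 covers the 9 output columns in steps of 3 (%11).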
%130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd :
!hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.return } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load 
@__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
%element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") :
!hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 =
arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global
private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, 
i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 
preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, 
vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] :
index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout
util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 
%112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add 
%77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64)
type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value,
@__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = 
llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, 
%14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = 
llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : 
!hal.device %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") 
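// This module is the input to FoldGlobals (iree-util-fold-globals). Comparing it
// with the "IR Dump After FoldGlobals" module below shows two changes: the
// never-read @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok
// global (and the store feeding it in the first initializer) is deleted, and the
// loads of the pipeline-layout and executable globals in this function become
// `util.global.load immutable`, since each of those globals is stored exactly
// once, from its initializer.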
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, 
@__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = 
llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> 
vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue 
%176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) 
flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: 
"embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) 
: i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, 
^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device 
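// The surrounding util.global.load ops now carry `immutable` (added by
// FoldGlobals): each of these globals is written exactly once, by its
// initializer, so later passes may treat the loaded values as constants.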
%__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") 
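// FuseGlobals (iree-util-fuse-globals), whose before/after dumps bracket this
// module, merges globals that provably always hold the same value. The three
// device-state globals here (device, pipeline layout, executable) hold distinct
// values, so no fusion is expected in this module.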
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, 
@__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = 
llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> 
vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue 
%176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) 
flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } // -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } // -----// IR Dump Before ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index 
%c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> 
(%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : 
!hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : 
!hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before InitializeDevicesPass (iree-hal-initialize-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, 
Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : 
index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add 
%134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = 
arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, 
i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After InitializeDevicesPass (iree-hal-initialize-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.initializer { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %0 = util.null : !hal.device %device_count = hal.devices.count : index %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { %4 = util.cmp.eq %arg2, %0 : !hal.device %5 = arith.cmpi slt, %arg0, %device_count : index %6 = arith.andi %4, %5 : i1 scf.condition(%6) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %4 = scf.if %value -> (i1) { %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_1 : i1 } else { %false = arith.constant false scf.yield %false : i1 } %5 = arith.cmpi eq, %arg1, %c0 : index %6 = arith.select %4, %c1, %c0 : index %7 = arith.addi %arg1, %6 : index %8 = arith.andi %4, %5 : i1 %9 = arith.select %8, %device_n, %0 : !hal.device %10 = arith.addi %arg0, %c1 : index scf.yield %10, %7, %9 : index, index, !hal.device } %2 = util.null : !hal.device %3 = util.cmp.eq %1#2, %2 : !hal.device scf.if %3 { %c5_i32 = arith.constant 5 : i32 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, 
native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %1#2, @__device_0 : !hal.device util.return } util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 
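// Annotation: the i64 constants here are precomputed linearized strides for the static shapes: 252 = 4*7*9 and 63 = 7*9 index the 2x4x7x9 output, while 150 = 6*5*5 and 25 = 5*5 index the 4x6x5x5 filter. The dynamic input strides (%59-%61 further down) are reassembled at runtime from the push-constant dims.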
%7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 
%138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = 
arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> 
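// -----// Note: host-side push constants //----- //
// This is the host half of the ABI the kernel decodes: every dynamic dim of
// %arg0 is widened with arith.index_castui and split into lo/hi i32 halves
// by arith.trunci / arith.shrui 32, yielding the eight push-constant words.
// A hedged C mirror of one split:
//
//   uint64_t d = (uint64_t)dim;
//   uint32_t lo = (uint32_t)d, hi = (uint32_t)(d >> 32);
//
// The one-shot command buffer being recorded here then receives the
// constants, a single descriptor set binding the two inputs plus the
// transient result buffer, a [7, 4, 1] dispatch, and a full execution
// barrier before it is finalized and queued.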
layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CombineInitializers (iree-util-combine-initializers) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.initializer { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %0 = util.null : !hal.device %device_count = hal.devices.count : index %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { %4 = util.cmp.eq %arg2, %0 : !hal.device %5 = arith.cmpi slt, %arg0, %device_count : index %6 = arith.andi %4, %5 : i1 scf.condition(%6) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %4 = scf.if %value -> (i1) { %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_1 : i1 } else { %false = arith.constant false scf.yield %false : i1 } %5 = arith.cmpi eq, %arg1, %c0 : index %6 = arith.select %4, %c1, %c0 : index %7 = arith.addi %arg1, %6 : index %8 = arith.andi %4, %5 : i1 %9 = arith.select %8, %device_n, %0 : !hal.device %10 = arith.addi %arg0, %c1 : index scf.yield %10, %7, %9 : index, index, !hal.device } %2 = util.null : !hal.device %3 = util.cmp.eq %1#2, %2 : !hal.device scf.if %3 { %c5_i32 = arith.constant 5 : i32 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } 
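// -----// Note: device selection initializer //----- //
// The initializer above scans devices with scf.while over hal.devices.count,
// accepting the first one whose "hal.device.id" matches "local*" and that
// supports the embedded-elf-x86_64 executable format. On failure,
// util.status.check_ok raises code 5 (NOT_FOUND in the canonical status
// numbering) carrying the quoted #hal.device.target description; otherwise
// the selected device is stored into @__device_0 just below, where the later
// initializers reload it.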
util.global.store %1#2, @__device_0 : !hal.device util.return } util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x 
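// -----// Note: lazy executable creation //----- //
// The remaining initializers above cache the format query in an i1 global,
// then either create the @embedded_elf_x86_64 variant of @main_dispatch_0
// (arith.select maps the query to index 0 or -1, and the scf.if takes the
// creation path on 0) or raise code 14 (UNAVAILABLE in the canonical status
// numbering) listing the formats that were compiled in. The layout and
// executable handles land in util.globals, so this cost is paid once at
// module load rather than per dispatch.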
vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, 
ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = 
llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 
: i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, 
%8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %0 = util.null : !hal.device %device_count = hal.devices.count : index %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { %8 = util.cmp.eq %arg2, %0 : !hal.device %9 = arith.cmpi slt, %arg0, %device_count : index %10 = arith.andi %8, %9 : i1 scf.condition(%10) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_2, %value_3 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %8 = scf.if %value_3 -> (i1) { %ok_4, %value_5 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_5 : i1 } else { %false = arith.constant false scf.yield %false : i1 } %9 = arith.cmpi eq, %arg1, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %arg1, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %0 : !hal.device %14 = arith.addi %arg0, %c1 : index scf.yield %14, %11, %13 : index, index, !hal.device } %2 = util.null : !hal.device %3 = util.cmp.eq %1#2, %2 : !hal.device scf.if %3 { %c5_i32 = arith.constant 5 : i32 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = 
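// -----// Note: effect of CombineInitializers //----- //
// This dump's single util.initializer (which this note sits inside) is the
// three initializers of the previous dump fused in their original order:
// device selection, format query, then pipeline-layout/executable creation.
// The util.global declarations are hoisted to the top of the module, and
// only SSA names change, with colliding constants renamed (%c0_0) and
// repeated global loads suffixed (%__device_0_1).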
\22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %1#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0_0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %4 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_1 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_1 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_1 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %5 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0_0, %c-1 : index %6 = arith.cmpi eq, %5, %c0_0 : index %7 = scf.if %6 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_1 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %4 : !hal.executable } util.global.store %7, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : 
!llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, 
%131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = 
arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : 
!hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before AffineExpandIndexOps (affine-expand-index-ops) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %0 = util.null : !hal.device %device_count = hal.devices.count : index %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { %8 = util.cmp.eq %arg2, %0 : !hal.device %9 = arith.cmpi slt, %arg0, %device_count : index %10 = arith.andi %8, %9 : i1 scf.condition(%10) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_2, %value_3 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %8 = scf.if %value_3 -> (i1) { %ok_4, %value_5 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_5 : i1 } else { %false = arith.constant false scf.yield %false : i1 } %9 = arith.cmpi eq, %arg1, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %arg1, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %0 : !hal.device %14 = arith.addi %arg0, %c1 : index scf.yield %14, %11, %13 : index, index, !hal.device } %2 = util.null : !hal.device %3 = util.cmp.eq %1#2, %2 : !hal.device scf.if %3 { %c5_i32 = arith.constant 5 : i32 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple 
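// -----// Note: affine-expand-index-ops //----- //
// This pass expands affine.linearize_index / affine.delinearize_index ops
// into plain arith arithmetic. The module is already fully lowered to the
// HAL/LLVM level at this point, so little if any affine IR remains and the
// dump before the pass is effectively identical to the one after
// CombineInitializers.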
= \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %1#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0_0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %4 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_1 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_1 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_1 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %5 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0_0, %c-1 : index %6 = arith.cmpi eq, %5, %c0_0 : index %7 = scf.if %6 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_1 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %4 : !hal.executable } util.global.store %7, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = 
llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 
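// Loop nest: ^bb1 iterates the batch (bound %14 = 2), ^bb2 the output columns (bound %12 = 9, step %11 = 3, one vector<3xf32> per step), and ^bb3/^bb4 the 5x5 filter window; the accumulator travels through the block arguments as !llvm.array<1 x array<1 x vector<3xf32>>>.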
preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = 
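// ^bb5 is unrolled six ways: each step loads a vector<3xf32> from binding 0 (the input), broadcasts one f32 from binding 1 (the 4x6x5x5 filter) via insertelement + shufflevector, and folds it in with llvm.intr.fmuladd; ^bb7 then stores the finished vector to binding 2 (the output) at offset %70*252 + %69*63 + %67*9 + %72.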
llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = 
util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, 
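// The one-shot command buffer records the eight push constants, a push_descriptor_set binding the two input buffers plus the transient buffer, a dispatch over 7x4x1 workgroups, and an execution barrier; it is then finalized, queued behind the alloca fence, and the transient buffer is returned as a 2x4x7x9 buffer_view.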
%c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After AffineExpandIndexOps (affine-expand-index-ops) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index %2:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %1) : (index, index, !hal.device) -> (index, index, !hal.device) { %7 = util.cmp.eq %arg2, %1 : !hal.device %8 = arith.cmpi slt, %arg0, %device_count : index %9 = arith.andi %7, %8 : i1 scf.condition(%9) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_1, %value_2 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %7 = scf.if %value_2 -> (i1) { %ok_3, %value_4 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_4 : i1 } else { scf.yield %false : i1 } %8 = arith.cmpi eq, %arg1, %c0 : index %9 = arith.select %7, %c1, %c0 : index %10 = arith.addi %arg1, %9 : index %11 = arith.andi %7, %8 : i1 %12 = arith.select %11, %device_n, %1 : !hal.device %13 = arith.addi %arg0, %c1 : index scf.yield %13, %10, %12 : index, index, !hal.device } %3 = util.cmp.eq %2#2, %1 : !hal.device scf.if %3 { util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %2#2, @__device_0 : !hal.device %__device_0 = 
util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %4 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %5 = arith.cmpi eq, %4, %c0 : index %6 = scf.if %5 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %6, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x 
vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 
[0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable 
@__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : 
!hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before ConvertAffineToStandard (lower-affine) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index %2:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %1) : (index, index, !hal.device) -> (index, index, !hal.device) { %7 = util.cmp.eq %arg2, %1 : !hal.device %8 = arith.cmpi slt, %arg0, %device_count : index %9 = arith.andi %7, %8 : i1 scf.condition(%9) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_1, %value_2 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %7 = scf.if %value_2 -> (i1) { %ok_3, %value_4 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_4 : i1 } else { scf.yield %false : i1 } %8 = arith.cmpi eq, %arg1, %c0 : index %9 = arith.select %7, %c1, %c0 : index %10 = arith.addi %arg1, %9 : index %11 = arith.andi %7, %8 : i1 %12 = arith.select %11, %device_n, %1 : !hal.device %13 = arith.addi %arg0, %c1 : index scf.yield %13, %10, %12 : index, index, !hal.device } %3 = util.cmp.eq %2#2, %1 : !hal.device scf.if %3 { util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %2#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, 
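// Before ConvertAffineToStandard: still the identical module; lower-affine would rewrite affine.apply/affine.for into arith and scf ops, but none remain at this point in the pipeline.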
@__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %4 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %5 = arith.cmpi eq, %4, %c0 : index %6 = scf.if %5 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %6, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", 
(i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x 
vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = 
llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : 
!hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) 
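// Synchronous ABI: @main waits on %fence_1 with an infinite timeout (timeout_millis = -1) before wrapping the result, matching the "sync func" declaration in the iree.reflection attribute.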
%status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index %2:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %1) : (index, index, !hal.device) -> (index, index, !hal.device) { %7 = util.cmp.eq %arg2, %1 : !hal.device %8 = arith.cmpi slt, %arg0, %device_count : index %9 = arith.andi %7, %8 : i1 scf.condition(%9) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_1, %value_2 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %7 = scf.if %value_2 -> (i1) { %ok_3, %value_4 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_4 : i1 } else { scf.yield %false : i1 } %8 = arith.cmpi eq, %arg1, %c0 : index %9 = arith.select %7, %c1, %c0 : index %10 = arith.addi %arg1, %9 : index %11 = arith.andi %7, %8 : i1 %12 = arith.select %11, %device_n, %1 : !hal.device %13 = arith.addi %arg0, %c1 : index scf.yield %13, %10, %12 : index, index, !hal.device } %3 = util.cmp.eq %2#2, %1 : !hal.device scf.if %3 { util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %2#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_0 = util.global.load 
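// After ConvertAffineToStandard: unchanged again, as expected with no affine ops present in the module.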
@__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %4 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %5 = arith.cmpi eq, %4, %c0 : index %6 = scf.if %5 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %6, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : 
!llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : 
i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector 
%156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2,
%3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) 
type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index %2:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %1) : (index, index, !hal.device) -> (index, index, !hal.device) { %7 = util.cmp.eq %arg2, %1 : !hal.device %8 = arith.cmpi slt, %arg0, %device_count : index %9 = arith.andi %7, %8 : i1 scf.condition(%9) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_1, %value_2 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %7 = scf.if %value_2 -> (i1) { %ok_3, %value_4 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_4 : i1 } else { scf.yield %false : i1 } %8 = arith.cmpi eq, %arg1, %c0 : index %9 = arith.select %7, %c1, %c0 : index %10 = arith.addi %arg1, %9 : index %11 = arith.andi %7, %8 : i1 %12 = arith.select %11, %device_n, %1 : !hal.device %13 = arith.addi %arg0, %c1 : index scf.yield %13, %10, %12 : index, index, !hal.device } %3 = util.cmp.eq %2#2, %1 : !hal.device scf.if %3 { util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %2#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %4 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %5 = arith.cmpi eq, %4, %c0 : index %6 = scf.if %5 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: 
[embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %6, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = 
hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before SerializeExecutablesPass (iree-hal-serialize-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr,
i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : 
i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = 
llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } // -----// IR Dump Before SerializeTargetExecutablesPass (iree-hal-serialize-target-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 =
llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, 
i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 
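// NOTE: this block (^bb5) is the fully unrolled innermost body of the
// decomposed conv: six strided vector<3xf32> loads from the input buffer and
// six scalar filter taps read 25 elements apart (one 5x5 filter plane per
// input channel, matching the 150-element stride per output channel), each
// tap splat across a vector<3xf32> and folded into the accumulator with an
// FMA. The recurring splat-and-FMA idiom, sketched with illustrative SSA
// names rather than the numbered values of this dump:
//   %e    = llvm.insertelement %tap, %undef[%c0_i32 : i32] : vector<3xf32>
//   %s    = llvm.shufflevector %e, %undef [0, 0, 0] : vector<3xf32>
//   %acc1 = llvm.intr.fmuladd(%in, %s, %acc0) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32>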
%137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } // -----// IR Dump After SerializeTargetExecutablesPass (iree-hal-serialize-target-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } // -----// IR Dump After SerializeExecutablesPass (iree-hal-serialize-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index
%__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] 
workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view }
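As a cross-check on the constants threaded through @main above: each dynamic dimension of %input0 is widened to i64 and split into low/high i32 halves by the arith.index_castui / arith.trunci / arith.shrui chains (four dims therefore become eight push constants), and the %c2400 / %c2016 byte lengths are just element counts times sizeof(f32) for the 4x6x5x5 weights and the 2x4x7x9 result of a stride-1, dilation-1 convolution. The arithmetic as a standalone Python sketch (illustrative, not an IREE API):

    # Shapes visible in the dump: weights 4x6x5x5, output 2x4x7x9; working
    # backwards through out = in - kernel + 1 gives the 2x6x11x13 input.
    N, C, H, W = 2, 6, 11, 13
    F, KH, KW = 4, 5, 5
    OH, OW = H - KH + 1, W - KW + 1              # 7, 9
    assert (OH, OW) == (7, 9)
    SIZEOF_F32 = 4
    assert F * C * KH * KW * SIZEOF_F32 == 2400  # %c2400: weights buffer bytes
    assert N * F * OH * OW * SIZEOF_F32 == 2016  # %c2016: transient output bytes

    def split_index(v: int) -> tuple[int, int]:
        # i64 -> (low i32, high i32), mirroring each trunci/shrui pair.
        return v & 0xFFFFFFFF, (v >> 32) & 0xFFFFFFFF

    assert split_index(13) == (13, 0)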
// -----// IR Dump Before PruneExecutablesPass (iree-hal-prune-executables) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1
%device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
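The util.initializer in the module above is a lowered device-selection loop: it walks the hal.devices.count devices, takes the first one whose hal.device.id matches "local*" and which reports support for the embedded-elf-x86_64 executable format, then creates the descriptor set layout (two read-only storage buffers plus one writable), the 8-constant pipeline layout, and the executable against that device; the two util.status.check_ok calls are the device-not-found and unsupported-format failure paths. The same control flow re-expressed as a plain Python loop (the _Device stub and its query helper are hypothetical; the real logic is the cf.br/cf.cond_br blocks above):

    class _Device:
        # Stand-in for a !hal.device handle (illustrative only).
        def __init__(self, props):
            self.props = props

        def query(self, category, key):
            return self.props.get((category, key), False)

    def select_device(devices):
        # ^bb1..^bb6: scan until a device is selected or the list is exhausted.
        for dev in devices:
            if not dev.query("hal.device.id", "local*"):
                continue  # ^bb4: not a local device
            if dev.query("hal.executable.format", "embedded-elf-x86_64"):
                return dev  # first match wins; the IR also exits its loop here
        # ^bb8: util.status.check_ok %c5_i32 fires with this message.
        raise RuntimeError("HAL device `__device_0` not found or unavailable")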
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 =
arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, 
@__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415
C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C000000000000000030000000000
0000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After SymbolDCE (symbol-dce) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = 
util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: 
[embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983F
C050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006
000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : 
!hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private 
@__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: 
// pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6
C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C0000000000000000000000000000000800000000000000000000000000000033000000010000000600000000000
0004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 
: !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private 
@__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) 
target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242F
F3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000200100000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000580300000000000058030000000000008000000000000000000000000000000008000000000000000000000000000000290000000100000002000
00000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : 
index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : 
!hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 cf.cond_br %5, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %18 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %19 = arith.cmpi eq, %18, %c0 : index cf.cond_br 
%19, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%20: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %20, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D74
3D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000000000000000000000100000000000000000000000000000017000000040000000200000000000000380200000000000038020000000000002001000000000000010000000000000008000000000000001800000000000000210000000100000002000000000000005803000000000
0005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0")
shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) 
type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 =
arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA844
98D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000200100
00000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select 
%__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F242
0F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000000000000000000000100000000000000000000000000000017000000040000000200000000000000380200000000000038020000000000002001000000000000010
0000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = 
hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = 
hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select 
%__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F242
0F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000000000000000000000100000000000000000000000000000017000000040000000200000000000000380200000000000038020000000000002001000000000000010
0000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = 
hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = 
hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable 
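// annotation (editorial, not compiler output): comparing this "Before" dump
// with the "After" dump that follows shows the effect of
// iree-util-simplify-global-accesses here: loads of a global that are
// dominated by a store to the same global in ^bb7 are replaced with the
// stored SSA values (%4 and %value_3), and the two util.global.store ops are
// batched together just before the cond_br. A minimal sketch of the
// store-to-load forwarding, using a hypothetical global @g:
//   util.global.store %v, @g : i1
//   %x = util.global.load @g : i1   // ==> users of %x are rewritten to use
//   ...                             //     %v; the store itself remains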
cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select %value_3, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index util.global.store %4, @__device_0 : !hal.device util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL 
device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = 
arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : 
i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create 
buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select %value_3, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index util.global.store %4, @__device_0 : !hal.device util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 
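// annotation (editorial, not compiler output): the util.initializer CFG in
// this dump performs HAL device selection; ^bb1-^bb6 roughly implement the
// following (illustrative pseudocode only):
//   device = null; matches = 0; i = 0
//   while device == null and i < hal.devices.count:
//     d = hal.devices.get(i)
//     if d.query("hal.device.id", "local*") and
//        d.query("hal.executable.format", "embedded-elf-x86_64"):
//       if matches == 0: device = d
//       matches += 1
//     i += 1
//   if device == null: report "HAL device `__device_0` not found or unavailable"
// ^bb9 just below is the companion failure path, taken when the selected
// device does not support the embedded-elf-x86_64 executable format.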
util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450F
C6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000040140000000000004004000000000000410200000000000
00000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator =
hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : 
!hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout 
util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C30000000000000000000000000000000400000000000000000000000000000000000000000000000000000000000
00000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E80700000000000000000000000000000100000000000000
00000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5])
type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false 
= arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) 
^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) 
^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3:
index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
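// NOTE: the %5..%8 chain above folds the f32 byte width into the element-count
// product, so %8 = d0 * d1 * d2 * d3 * 4 bytes; that is the value checked as
// minimum_length for the dynamically shaped input0. The static operands use the
// same formula pre-folded into constants: the weights tensor<4x6x5x5xf32> need
// 4*6*5*5*4 = 2400 bytes (%c2400, checked on the next assert) and the result
// tensor<2x4x7x9xf32> needs 2*4*7*9*4 = 2016 bytes (%c2016). As a worked
// example, a batch-2, 6-channel 11x13 input (the shape a 5x5 NCHW conv would
// need to produce 2x4x7x9) resolves the dynamic check to 2*6*11*13*4 = 6864 bytes.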
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: 
!hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
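// NOTE: the %9..%24 sequence further below is the usual lo/hi split for passing
// `index`-typed dims through 32-bit push constants; per dimension the pattern
// (illustrative SSA names, not taken from this dump) is:
//   %d   = arith.index_castui %dim : index to i64
//   %lo  = arith.trunci %d : i64 to i32            // bits [0, 32)
//   %hiw = arith.shrui %d, %c32_i64 : i64
//   %hi  = arith.trunci %hiw : i64 to i32          // bits [32, 64)
// Four dynamic dims thus yield the eight i32 values handed to
// hal.command_buffer.push_constants.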
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before Inliner (inline) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = 
util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
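// NOTE: the tail of this function is a synchronous wrapper over the async HAL
// queue: util.null supplies a null (no-wait) fence for hal.device.queue.alloca,
// which signals %fence once the 2016-byte transient result buffer is live;
// queue.execute waits on %fence and signals %fence_1; hal.fence.await with
// timeout_millis(-1) then blocks indefinitely before the transient buffer is
// wrapped as the 2x4x7x9xf32 result buffer_view.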
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : 
!hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> 
key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence 
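// NOTE: the descriptor bindings above mirror the layout built in the
// initializer: slots 0 and 1 (input and weights, ReadOnly|Indirect) carry %8
// and %c2400 bytes, slot 2 (Indirect) is the 2016-byte transient output, and
// the dispatch runs main_dispatch_0 over a [7, 4, 1] workgroup grid, presumably
// tiling the 2x4x7x9 result; a full execution barrier then orders the dispatch
// before the command buffer is finalized.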
hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 
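// NOTE: this "After Canonicalizer" dump of @main appears byte-for-byte
// identical to the "Before" dump above; on this trace the pass found nothing
// to fold in the already-canonical ABI wrapper.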
%12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After Inliner (inline) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : 
!hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
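// ABI marshaling in @main: each input buffer_view is asserted against its declared shape and f32 element type, and its backing buffer against a minimum byte length: %8 = d0*d1*d2*d3*4 for the dynamic input0 (the %c4 factor is sizeof(f32)), and %c2400 = 4*6*5*5*4 for the static input1.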
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SymbolDCE (symbol-dce) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 
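// Device-enumeration loop in the initializer: ^bb1 carries (device index, match count, selected device) and iterates until a device matches the "local*" id query and supports the "embedded-elf-x86_64" executable format, or %device_count is exhausted; the fallthrough to ^bb6 reports the not-found status.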
%5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
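// The 2016-byte transient buffer allocated below (2*4*7*9 = 504 f32 elements * 4 bytes) holds the conv output; hal.device.queue.alloca signals %fence so the allocation is ordered before the dispatch that writes it, and the buffer is finally wrapped in the returned !hal.buffer_view.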
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After SymbolDCE (symbol-dce) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 
= util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
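// Command recording: the one-shot command buffer receives the eight push constants, a descriptor set binding input0 and input1 read-only plus the transient output at slot 2, and a single dispatch over [7, 4, 1] workgroups; the queue then executes it waiting on %fence and signaling %fence_1, which the host awaits before building the result view.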
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = 
hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = 
// -----// IR Dump Before LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump Before AffineExpandIndexOps (affine-expand-index-ops) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After AffineExpandIndexOps (affine-expand-index-ops) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump Before ConvertAffineToStandard (lower-affine) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump Before ArithUnsignedWhenEquivalent (arith-unsigned-when-equivalent) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After ArithUnsignedWhenEquivalent (arith-unsigned-when-equivalent) //----- //
// (util.initializer unchanged; identical to the preceding dump)
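From here on the trace repeats the same cleanup passes over the public @main wrapper, shown next. One detail worth noting before the dump: the submission in @main is fully asynchronous and ordered only by fences. hal.device.queue.alloca signals %fence once the transient output buffer is ready, hal.device.queue.execute waits on %fence and signals %fence_1, and the host finally blocks in hal.fence.await on %fence_1. A small self-contained Python sketch of that dependency order (illustrative only; the op names here are plain strings, not runtime calls):

from graphlib import TopologicalSorter

# Edges mirror the wait/signal fences in @main below.
deps = {
    "queue.alloca(%transient_buffer)": [],                         # wait(%0: null fence)
    "queue.execute([%cmd])": ["queue.alloca(%transient_buffer)"],  # wait(%fence)
    "fence.await(%fence_1)": ["queue.execute([%cmd])"],            # host-side wait
}
order = list(TopologicalSorter(deps).static_order())
print(order)  # alloca first, then execute, then the host wait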
// -----// IR Dump Before SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c4 = arith.constant 4 : index
  %c5 = arith.constant 5 : index
  %c0 = arith.constant 0 : index
  %c2400 = arith.constant 2400 : index
  %c2016 = arith.constant 2016 : index
  %c32_i64 = arith.constant 32 : i64
  %c-1_i64 = arith.constant -1 : i64
  %0 = util.null : !hal.fence
  %c0_i64 = arith.constant 0 : i64
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c-1_i32 = arith.constant -1 : i32
  %c9 = arith.constant 9 : index
  %__device_0 = util.global.load immutable @__device_0 : !hal.device
  %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout
  %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major)
  %5 = arith.muli %1, %c4 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = arith.muli %7, %4 : index
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016}
  %9 = arith.index_castui %1 : index to i64
  %10 = arith.trunci %9 : i64 to i32
  %11 = arith.shrui %9, %c32_i64 : i64
  %12 = arith.trunci %11 : i64 to i32
  %13 = arith.index_castui %2 : index to i64
  %14 = arith.trunci %13 : i64 to i32
  %15 = arith.shrui %13, %c32_i64 : i64
  %16 = arith.trunci %15 : i64 to i32
  %17 = arith.index_castui %3 : index to i64
  %18 = arith.trunci %17 : i64 to i32
  %19 = arith.shrui %17, %c32_i64 : i64
  %20 = arith.trunci %19 : i64 to i32
  %21 = arith.index_castui %4 : index to i64
  %22 = arith.trunci %21 : i64 to i32
  %23 = arith.shrui %21, %c32_i64 : i64
  %24 = arith.trunci %23 : i64 to i32
  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer
  hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
    %c0 = (%buffer : !hal.buffer)[%c0, %8],
    %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
    %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
  ])
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None")
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
  hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd])
  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32
  util.status.check_ok %status, "failed to wait on timepoint"
  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
  util.return %view : !hal.buffer_view
}
// -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- //
// (@main unchanged; identical to the preceding dump)
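In the dump above, the %9 through %24 chain packs the four dynamic dimensions of %input0 into the eight i32 push constants fed to hal.command_buffer.push_constants: each index is zero-extended to i64, the low word is taken with arith.trunci, and the high word with arith.shrui by 32 followed by arith.trunci. The same arithmetic in Python, using this trace's 2x6x11x13 input as example values:

def split_index(dim):
    v = dim & 0xFFFFFFFFFFFFFFFF   # arith.index_castui ... : index to i64
    lo = v & 0xFFFFFFFF            # arith.trunci : i64 to i32
    hi = (v >> 32) & 0xFFFFFFFF    # arith.shrui ..., 32 then trunci
    return lo, hi

dims = (2, 6, 11, 13)
words = [w for d in dims for w in split_index(d)]
print(words)  # [2, 0, 6, 0, 11, 0, 13, 0] -> values([%10, %12, ..., %24])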
// -----// IR Dump Before LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// (@main unchanged; identical to the preceding dump)
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// (@main unchanged; identical to the preceding dump)
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- //
// (@main unchanged; identical to the preceding dump)
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
// (@main unchanged; identical to the preceding dump)
// -----// IR Dump Before AffineExpandIndexOps (affine-expand-index-ops) //----- //
// (@main unchanged; the captured log breaks off partway through this dump)
!hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : 
!hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After AffineExpandIndexOps (affine-expand-index-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ConvertAffineToStandard (lower-affine) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = 
util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, 
%c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : 
!hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ArithUnsignedWhenEquivalent (arith-unsigned-when-equivalent) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable 
@__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : 
!hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After ArithUnsignedWhenEquivalent (arith-unsigned-when-equivalent) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before PropagateSubranges (iree-util-propagate-subranges) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: 
"embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016}
    %9 = arith.index_castui %1 : index to i64
    %10 = arith.trunci %9 : i64 to i32
    %11 = arith.shrui %9, %c32_i64 : i64
    %12 = arith.trunci %11 : i64 to i32
    %13 = arith.index_castui %2 : index to i64
    %14 = arith.trunci %13 : i64 to i32
    %15 = arith.shrui %13, %c32_i64 : i64
    %16 = arith.trunci %15 : i64 to i32
    %17 = arith.index_castui %3 : index to i64
    %18 = arith.trunci %17 : i64 to i32
    %19 = arith.shrui %17, %c32_i64 : i64
    %20 = arith.trunci %19 : i64 to i32
    %21 = arith.index_castui %4 : index to i64
    %22 = arith.trunci %21 : i64 to i32
    %23 = arith.shrui %21, %c32_i64 : i64
    %24 = arith.trunci %23 : i64 to i32
    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer
    hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %8],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
      %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
    ])
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None")
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
    hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd])
    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32
    util.status.check_ok %status, "failed to wait on timepoint"
    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
    util.return %view : !hal.buffer_view
  }
}
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- //
module {
  util.global private @__device_0 : !hal.device
  util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout
  util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable
  util.initializer {
    %0 = util.null : !hal.executable
    %c14_i32 = arith.constant 14 : i32
    %c-1 = arith.constant -1 : index
    %c5_i32 = arith.constant 5 : i32
    %false = arith.constant false
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %1 = util.null : !hal.device
    %device_count = hal.devices.count : index
    cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
  ^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
    %5 = util.cmp.eq %4, %1 : !hal.device
    %6 = arith.cmpi slt, %2, %device_count : index
    %7 = arith.andi %5, %6 : i1
    cf.cond_br %7, ^bb2, ^bb5
  ^bb2:  // pred: ^bb1
    %device_n = hal.devices.get %2 : !hal.device
    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
  ^bb3:  // pred: ^bb2
    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
    cf.br ^bb4(%value_1 : i1)
  ^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
    %9 = arith.cmpi eq, %3, %c0 : index
    %10 = arith.select %8, %c1, %c0 : index
    %11 = arith.addi %3, %10 : index
    %12 = arith.andi %8, %9 : i1
    %13 = arith.select %12, %device_n, %1 : !hal.device
    %14 = arith.addi %2, %c1 : index
    cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
  ^bb5:  // pred: ^bb1
    cf.cond_br %5, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
    cf.br ^bb7
  ^bb7:  // 2 preds: ^bb5, ^bb6
    %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout
    %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
    %15 = arith.select %value_3, %c0, %c-1 : index
    %16 = arith.cmpi eq, %15, %c0 : index
    util.global.store %4, @__device_0 : !hal.device
    cf.cond_br %16, ^bb8, ^bb9
  ^bb8:  // pred: ^bb7
    %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable
    cf.br ^bb10(%exe : !hal.executable)
  ^bb9:  // pred: ^bb7
    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    cf.br ^bb10(%0 : !hal.executable)
  ^bb10(%17: !hal.executable):  // 2 preds: ^bb8, ^bb9
    util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable
    util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout
    util.return
  }
  hal.executable private @main_dispatch_0 {
    hal.executable.binary public @embedded_elf_x86_64 attributes {data =
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
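// Note on the @main body above: each dynamic dimension of %input0 is an
// `index` that is widened to i64 and split into low/high i32 halves
// (arith.index_castui / arith.trunci / arith.shrui %c32_i64) so it can be
// recorded as 32-bit push constants. A minimal C sketch of that packing,
// using a hypothetical helper name (this is not IREE runtime API):
#include <stdint.h>

static void pack_index_as_push_constants(uint64_t dim, uint32_t out[2]) {
  out[0] = (uint32_t)dim;          /* low word,  e.g. %10 = trunci %9          */
  out[1] = (uint32_t)(dim >> 32);  /* high word, e.g. %12 = trunci(%9 >> 32)   */
}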
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (module byte-identical to the "IR Dump After PropagateSubranges" above; duplicate body elided)
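// The util.initializer in these dumps is a first-match device scan: it walks
// hal.devices.count devices, keeps the first one whose "hal.device.id"
// matches "local*" and that also reports support for "embedded-elf-x86_64",
// and fails with the %c5_i32 status if the loop exits with no match. A
// C-style sketch of the same control flow (the query helpers are
// hypothetical stand-ins for hal.device.query, not real API):
#include <stdbool.h>
#include <stddef.h>

bool device_id_matches(size_t i, const char *pattern);      /* "local*" query */
bool device_supports_format(size_t i, const char *format);  /* ELF query      */

static ptrdiff_t find_local_elf_device(size_t device_count) {
  for (size_t i = 0; i < device_count; ++i) {
    /* ^bb2/^bb3: both queries must succeed; ^bb4 keeps only the first hit. */
    if (device_id_matches(i, "local*") &&
        device_supports_format(i, "embedded-elf-x86_64")) {
      return (ptrdiff_t)i;
    }
  }
  return -1;  /* ^bb6: util.status.check_ok %c5_i32, "... not found ..." */
}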
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (canonicalize made no changes; duplicate module body elided)
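// The minimum_length values asserted on the buffers in @main are byte counts
// for row-major f32 data: the dynamic input checks %8 = d0*4*d1*d2*d3 (the
// 4 bytes/element folded into the product), while the constants are
// 2400 = 4*6*5*5*4 for the weights and 2016 = 2*4*7*9*4 for the transient
// output. A quick self-contained check in C:
#include <assert.h>
#include <stdint.h>

static uint64_t f32_bytes(uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3) {
  return d0 * d1 * d2 * d3 * 4;  /* 4 bytes per f32 element */
}

int main(void) {
  assert(f32_bytes(4, 6, 5, 5) == 2400);  /* %c2400: 4x6x5x5 weights */
  assert(f32_bytes(2, 4, 7, 9) == 2016);  /* %c2016: 2x4x7x9 output  */
  return 0;
}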
// -----// IR Dump Before CSE (cse) //----- //
// (module still identical to the dump above; duplicate body elided)
util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
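// NOTE (editorial): %8 above is the byte length of the dynamic input0: the four buffer_view dims are multiplied together along with %c4 (sizeof f32), so the hal.buffer.assert on %buffer checks minimum_length(%8), e.g. 2*6*11*13*4 = 6864 bytes for a 2x6x11x13xf32 argument. The assert on %buffer_0 that follows instead uses the precomputed constant %c2400 = 4*6*5*5*4 bytes for the static weights.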
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = 
util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
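// NOTE (editorial): the hal.device.queue.alloca just below allocates the 2016-byte transient buffer that holds the tensor<2x4x7x9xf32> result (2*4*7*9*4 bytes); it waits on the null fence %0 (no dependency) and signals %fence, which the later hal.device.queue.execute in turn waits on before the dispatch runs.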
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = 
hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = 
hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable 
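// NOTE (editorial): in this pre-pass dump the three immutable util.global.load ops sit after the constant definitions; in the "After" dump further down, iree-util-simplify-global-accesses has hoisted them to the top of @main, which appears to be the only change the pass makes to this function.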
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : 
!hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 
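// NOTE (editorial): the index_castui/trunci/shrui sequence starting here splits each dynamic dim into a pair of i32 values: arith.trunci keeps the low 32 bits, and arith.shrui by %c32_i64 followed by trunci yields the high 32 bits, producing the eight words handed to hal.command_buffer.push_constants, presumably so the kernel side can reconstruct the 64-bit sizes.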
%10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = 
arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
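// NOTE (editorial, speculative): the hal.command_buffer.dispatch later in this function launches a 7x4x1 workgroup grid (%c7, %c4, %c1); this plausibly tiles the H=7 and F=4 dimensions of the 2x4x7x9xf32 result, with the N and W dimensions handled inside each workgroup, though the actual tiling is decided by the codegen backend.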
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 
preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
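// NOTE: in this "After ApplyPatterns" dump the function appears to differ from the one above only in canonicalized constant ordering; the remainder below records the same one-shot command buffer: binding 0 (%buffer, dynamic size %8), binding 1 (%buffer_0, 2400 bytes), binding 2 (%transient_buffer, 2016 bytes), dispatched over workgroups [7, 4, 1].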
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module elided: byte-for-byte identical to the "IR Dump After ApplyPatterns" module above, including the embedded ELF blob)
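For reference, the constant sizes recurring in these dumps follow directly from the conv shapes. A minimal Python sketch (illustrative only; pack_dims and the variable names are not IREE APIs):

    # Filter tensor<4x6x5x5xf32>: 4*6*5*5 elements * 4 bytes = 2400 (%c2400).
    filter_bytes = 4 * 6 * 5 * 5 * 4
    # Output tensor<2x4x7x9xf32>: for a stride-1, no-dilation 5x5 conv,
    # 7 = 11-5+1 and 9 = 13-5+1; 2*4*7*9 elements * 4 bytes = 2016 (%c2016,
    # the size of the transient buffer from hal.device.queue.alloca).
    output_bytes = 2 * 4 * 7 * 9 * 4

    def pack_dims(dims):
        """Split each 64-bit dim into (lo32, hi32) words, mirroring the
        arith.trunci / arith.shrui %c32_i64 pairs in the dumped @main."""
        words = []
        for d in dims:
            words.append(d & 0xFFFFFFFF)           # arith.trunci
            words.append((d >> 32) & 0xFFFFFFFF)   # arith.shrui + trunci
        return words

    assert filter_bytes == 2400 and output_bytes == 2016
    # The 2x6x11x13 input dims become the 8 push constants [2,0,6,0,11,0,13,0].
    assert pack_dims([2, 6, 11, 13]) == [2, 0, 6, 0, 11, 0, 13, 0]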
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module elided: byte-for-byte identical to the "IR Dump After ApplyPatterns" module above, including the embedded ELF blob)
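The util.initializer repeated in each of these modules is a first-match device scan. A hedged Python sketch of its control flow (^bb1..^bb5); the runtime object and its devices_count/devices_get/query methods are hypothetical stand-ins for the hal.devices.count, hal.devices.get, and hal.device.query ops, not real IREE APIs:

    def select_device(runtime):
        # Walk devices in order and stop at the first one whose id matches
        # "local*" and which supports the "embedded-elf-x86_64" format.
        for i in range(runtime.devices_count()):       # hal.devices.count
            candidate = runtime.devices_get(i)         # hal.devices.get
            if (candidate.query("hal.device.id", "local*")
                    and candidate.query("hal.executable.format",
                                        "embedded-elf-x86_64")):
                return candidate                       # stored to @__device_0
        # In the IR this path is util.status.check_ok %c5_i32, "...".
        raise RuntimeError("HAL device `__device_0` not found or unavailable")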
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2
preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: 
^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::ConversionPass (iree-vm-conversion) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: 
index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
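// The static 2x4x7x9 result shape used below follows from the standard
// convolution output-extent formula with unit stride and dilation and no
// padding: 11 - 5 + 1 = 7 and 13 - 5 + 1 = 9 for the spatial dims, with
// batch 2 and filter count 4 carried through. A small C sketch of the
// general formula (an illustrative helper, not taken from the IR):
//
//   /* Output extent of a convolution along one dimension. */
//   static int conv_out_extent(int in, int kernel, int stride, int dilation) {
//     int effective_kernel = dilation * (kernel - 1) + 1;
//     return (in - effective_kernel) / stride + 1;
//   }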
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After mlir::iree_compiler::IREE::VM::ConversionPass (iree-vm-conversion) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, 
%null_1 : i64, i64, !vm.ref<!hal.device>)
^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
%req = vm.cmp.eq.ref %4, %null_1 : !vm.ref<!hal.device>
%slt = vm.cmp.lt.i64.s %2, %1 : i64
%5 = vm.and.i32 %req, %slt : i32
vm.cond_br %5, ^bb2, ^bb5
^bb2: // pred: ^bb1
%6 = vm.trunc.i64.i32 %2 : i64 -> i32
%ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
%buffer = vm.rodata.inline "_utf8_hal_device_id_D0F1B3E9D63E707C" {alignment = 1 : i64} : !vm.buffer = "hal.device.id"
%buffer_2 = vm.rodata.inline "_utf8_local_8DC315A014BAFA34" {alignment = 1 : i64} : !vm.buffer = "local*"
%7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_2) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
%nz = vm.cmp.nz.i64 %7#1 : i64
%zero_3 = vm.const.i32.zero
%8 = vm.select.i32 %7#0, %nz, %zero_3 : i32
%c1_4 = vm.const.i32 1
vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
%buffer_5 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
%buffer_6 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
%9:2 = vm.call @hal.device.query.i64(%ref, %buffer_5, %buffer_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
%nz_7 = vm.cmp.nz.i64 %9#1 : i64
%zero_8 = vm.const.i32.zero
%10 = vm.select.i32 %9#0, %nz_7, %zero_8 : i32
%c1_9 = vm.const.i32 1
vm.br ^bb4(%10 : i32)
^bb4(%11: i32): // 2 preds: ^bb2, ^bb3
%eq = vm.cmp.eq.i64 %3, %zero_0 : i64
%12 = vm.select.i64 %11, %c1, %zero_0 : i64
%13 = vm.add.i64 %3, %12 : i64
%14 = vm.and.i32 %11, %eq : i32
%ref_10 = vm.select.ref %14, %ref, %null_1 : !vm.ref<!hal.device>
%15 = vm.add.i64 %2, %c1 : i64
vm.br ^bb1(%15, %13, %ref_10 : i64, i64, !vm.ref<!hal.device>)
^bb5: // pred: ^bb1
vm.cond_br %req, ^bb6, ^bb7
^bb6: // pred: ^bb5
vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
vm.br ^bb7
^bb7: // 2 preds: ^bb5, ^bb6
%buffer_11 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
%buffer_12 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
%16:2 = vm.call @hal.device.query.i64(%4, %buffer_11, %buffer_12) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
%nz_13 = vm.cmp.nz.i64 %16#1 : i64
%zero_14 = vm.const.i32.zero
%17 = vm.select.i32 %16#0, %nz_13, %zero_14 : i32
%c1_15 = vm.const.i32 1
%c1_16 = vm.const.i32 1
%zero_17 = vm.const.i32.zero
%c7 = vm.const.i32 7
%c3 = vm.const.i32 3
%c1_18 = vm.const.i32 1
%c7_19 = vm.const.i32 7
%c3_20 = vm.const.i32 3
%c2 = vm.const.i32 2
%c7_21 = vm.const.i32 7
%c2_22 = vm.const.i32 2
%ref_23 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_16, [(%zero_17, %c7, %c3), (%c1_18, %c7_19, %c3_20), (%c2, %c7_21, %c2_22)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
%c8 = vm.const.i32 8
%ref_24 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_23]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_25 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_25, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %buffer_26 = vm.rodata.inline "main_dispatch_0_embedded_elf_x86_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB0
64989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C80000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000
58270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> %buffer_27 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %null_28 = vm.const.ref.zero : !vm.buffer %ref_29 = vm.call.variadic @hal.executable.create(%4, %buffer_27, %buffer_26, %null_28, [%ref_24]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref<!hal.executable>
    vm.br ^bb10(%ref_29 : !vm.ref<!hal.executable>)
  ^bb9:  // pred: ^bb7
    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    vm.br ^bb10(%null : !vm.ref<!hal.executable>)
  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
    vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_24, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7 = vm.const.i64 7
    %c2 = vm.const.i64 2
    %c1 = vm.const.i64 1
    %zero = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_0 = vm.const.i64 -1
    %c32 = vm.const.i64 32
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %zero_1 = vm.const.i64.zero
    %c5 = vm.const.i64 5
    %c4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %zero_2 = vm.const.i32.zero
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c1_3 = vm.const.i32 1
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c2_4 = vm.const.i32 2
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c3 = vm.const.i32 3
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c553648160 = vm.const.i32 553648160
    %c1_5 = vm.const.i32 1
    %buffer = vm.rodata.inline "_utf8_input0_DA9A70D360954439" {alignment = 1 : i64} : !vm.buffer = "input0"
    vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %buffer_7 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor"
    %c16 = vm.const.i32 16
    %c3075 = vm.const.i32 3075
    vm.call @hal.buffer.assert(%ref, %buffer_7, %ref_6, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %buffer_8 = vm.rodata.inline "_utf8_input1_FDCC539DA203DDD3" {alignment = 1 : i64} : !vm.buffer = "input1"
    vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_8, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %buffer_10 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor"
    %c16_11 = vm.const.i32 16
    %c3075_12 = vm.const.i32 3075
    vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_6, %c2400, %c16_11, %c3075_12) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %zero_13 = vm.const.i32.zero
    %ref_14 = vm.call @hal.fence.create(%__device_0, %zero_13) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %zero_15 = vm.const.i32.zero
    %c48 = vm.const.i32 48
    %c3075_16 = vm.const.i32 3075
    %ref_17 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_14, %zero_15, %c48, %c3075_16, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %c32_18 = vm.const.i32 32
    %9 = vm.shr.i64.u %0, %c32_18 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %c32_19 = vm.const.i32 32
    %12 = vm.shr.i64.u %1, %c32_19 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %c32_20 = vm.const.i32 32
    %15 = vm.shr.i64.u %2, %c32_20 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %c32_21 = vm.const.i32 32
    %18 = vm.shr.i64.u %3, %c32_21 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %c1_22 = vm.const.i32 1
    %c3_23 = vm.const.i32 3
    %zero_24 = vm.const.i32.zero
    %ref_25 = vm.call @hal.command_buffer.create(%__device_0, %c1_22, %c3_23, %c-1_0, %zero_24) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    %zero_26 = vm.const.i32.zero
    vm.call.variadic @hal.command_buffer.push_constants(%ref_25, %__device_0_pipeline_layout_0, %zero_26, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    %zero_27 = vm.const.i32.zero
    %zero_28 = vm.const.i32.zero
    %zero_29 = vm.const.i32.zero
    %c1_30 = vm.const.i32 1
    %zero_31 = vm.const.i32.zero
    %c2_32 = vm.const.i32 2
    %zero_33 = vm.const.i32.zero
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_25, %__device_0_pipeline_layout_0, %zero_27, [(%zero_28, %zero_29, %ref, %zero_1, %7), (%c1_30, %zero_31, %ref_9, %zero_1, %c2400), (%c2_32, %zero_33, %ref_17, %zero_1, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    %zero_34 = vm.const.i32.zero
    %c7_35 = vm.const.i32 7
    %c4_36 = vm.const.i32 4
    %c1_37 = vm.const.i32 1
    %zero_38 = vm.const.i64.zero
    vm.call @hal.command_buffer.dispatch(%ref_25, %__device_0_executable_0_main_dispatch_0, %zero_34, %c7_35, %c4_36, %c1_37, %zero_38) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    %c28 = vm.const.i32 28
    %c13 = vm.const.i32 13
    %zero_39 = vm.const.i32.zero
    vm.call @hal.command_buffer.execution_barrier(%ref_25, %c28, %c13, %zero_39) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_25) : (!vm.ref<!hal.command_buffer>) -> ()
    %zero_40 = vm.const.i32.zero
    %ref_41 = vm.call @hal.fence.create(%__device_0, %zero_40) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_14, %ref_41, [%ref_25]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_41]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_fail %20, "failed to wait on timepoint"
    %ref_42 = vm.call.variadic @hal.buffer_view.create(%ref_17, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_42 : !vm.ref<!hal.buffer_view>
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
}
}
// -----// IR Dump Before mlir::iree_compiler::IREE::VM::ReifyRodataTablesPass (iree-vm-reify-rodata-tables) //----- //
vm.module public @module {
  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.initializer {
    %null = vm.const.ref.zero : !vm.ref<!hal.executable>
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1 = vm.const.i64 1
    %null_1 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
    %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref<!hal.device>
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %5 = vm.and.i32 %req, %slt : i32
    vm.cond_br %5, ^bb2, ^bb5
  ^bb2:  // pred: ^bb1
    %6 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %buffer = vm.rodata.inline "_utf8_hal_device_id_D0F1B3E9D63E707C" {alignment = 1 : i64} : !vm.buffer = "hal.device.id"
    %buffer_2 = vm.rodata.inline "_utf8_local_8DC315A014BAFA34" {alignment = 1 : i64} : !vm.buffer = "local*"
    %7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_2) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %7#1 : i64
    %zero_3 = vm.const.i32.zero
    %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32
    %c1_4 = vm.const.i32 1
    vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
  ^bb3:  // pred: ^bb2
    %buffer_5 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
    %buffer_6 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
    %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_5, %buffer_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %9#1 : i64
    %zero_8 = vm.const.i32.zero
    %10 = vm.select.i32 %9#0, %nz_7, %zero_8 : i32
    %c1_9 = vm.const.i32 1
    vm.br ^bb4(%10 : i32)
  ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %12 = vm.select.i64 %11, %c1, %zero_0 : i64
    %13 = vm.add.i64 %3, %12 : i64
    %14 = vm.and.i32 %11, %eq : i32
    %ref_10 = vm.select.ref %14, %ref, %null_1 : !vm.ref<!hal.device>
    %15 = vm.add.i64 %2, %c1 : i64
    vm.br ^bb1(%15, %13, %ref_10 : i64, i64, !vm.ref<!hal.device>)
  ^bb5:  // pred: ^bb1
    vm.cond_br %req, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>"
    vm.br ^bb7
  ^bb7:  // 2 preds: ^bb5, ^bb6
    %buffer_11 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
    %buffer_12 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
    %16:2 = vm.call @hal.device.query.i64(%4, %buffer_11, %buffer_12) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer,
!vm.buffer) -> (i32, i64) %nz_13 = vm.cmp.nz.i64 %16#1 : i64 %zero_14 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_13, %zero_14 : i32 %c1_15 = vm.const.i32 1 %c1_16 = vm.const.i32 1 %zero_17 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_18 = vm.const.i32 1 %c7_19 = vm.const.i32 7 %c3_20 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_21 = vm.const.i32 7 %c2_22 = vm.const.i32 2 %ref_23 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_16, [(%zero_17, %c7, %c3), (%c1_18, %c7_19, %c3_20), (%c2, %c7_21, %c2_22)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_24 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_23]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_25 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_25, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %buffer_26 = vm.rodata.inline "main_dispatch_0_embedded_elf_x86_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666
666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000
00000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> %buffer_27 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %null_28 = vm.const.ref.zero : !vm.buffer %ref_29 = vm.call.variadic @hal.executable.create(%4, %buffer_27, %buffer_26, %null_28, [%ref_24]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref<!hal.executable>
    vm.br ^bb10(%ref_29 : !vm.ref<!hal.executable>)
  ^bb9:  // pred: ^bb7
    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    vm.br ^bb10(%null : !vm.ref<!hal.executable>)
  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
    vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_24, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7 = vm.const.i64 7
    %c2 = vm.const.i64 2
    %c1 = vm.const.i64 1
    %zero = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_0 = vm.const.i64 -1
    %c32 = vm.const.i64 32
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %zero_1 = vm.const.i64.zero
    %c5 = vm.const.i64 5
    %c4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %zero_2 = vm.const.i32.zero
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c1_3 = vm.const.i32 1
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c2_4 = vm.const.i32 2
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c3 = vm.const.i32 3
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c553648160 = vm.const.i32 553648160
    %c1_5 = vm.const.i32 1
    %buffer = vm.rodata.inline "_utf8_input0_DA9A70D360954439" {alignment = 1 : i64} : !vm.buffer = "input0"
    vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %buffer_7 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor"
    %c16 = vm.const.i32 16
    %c3075 = vm.const.i32 3075
    vm.call @hal.buffer.assert(%ref, %buffer_7, %ref_6, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %buffer_8 = vm.rodata.inline "_utf8_input1_FDCC539DA203DDD3" {alignment = 1 : i64} : !vm.buffer = "input1"
    vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_8, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %buffer_10 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor"
    %c16_11 = vm.const.i32 16
    %c3075_12 = vm.const.i32 3075
    vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_6, %c2400, %c16_11, %c3075_12) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %zero_13 = vm.const.i32.zero
    %ref_14 = vm.call @hal.fence.create(%__device_0, %zero_13) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %zero_15 = vm.const.i32.zero
    %c48 = vm.const.i32 48
    %c3075_16 = vm.const.i32 3075
    %ref_17 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_14, %zero_15, %c48, %c3075_16, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %c32_18 = vm.const.i32 32
    %9 = vm.shr.i64.u %0, %c32_18 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %c32_19 = vm.const.i32 32
    %12 = vm.shr.i64.u %1, %c32_19 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %c32_20 = vm.const.i32 32
    %15 = vm.shr.i64.u %2, %c32_20 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %c32_21 = vm.const.i32 32
    %18 = vm.shr.i64.u %3, %c32_21 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %c1_22 = vm.const.i32 1
    %c3_23 = vm.const.i32 3
    %zero_24 = vm.const.i32.zero
    %ref_25 = vm.call @hal.command_buffer.create(%__device_0, %c1_22, %c3_23, %c-1_0, %zero_24) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    %zero_26 = vm.const.i32.zero
    vm.call.variadic @hal.command_buffer.push_constants(%ref_25, %__device_0_pipeline_layout_0, %zero_26, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    %zero_27 = vm.const.i32.zero
    %zero_28 = vm.const.i32.zero
    %zero_29 = vm.const.i32.zero
    %c1_30 = vm.const.i32 1
    %zero_31 = vm.const.i32.zero
    %c2_32 = vm.const.i32 2
    %zero_33 = vm.const.i32.zero
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_25, %__device_0_pipeline_layout_0, %zero_27, [(%zero_28, %zero_29, %ref, %zero_1, %7), (%c1_30, %zero_31, %ref_9, %zero_1, %c2400), (%c2_32, %zero_33, %ref_17, %zero_1, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    %zero_34 = vm.const.i32.zero
    %c7_35 = vm.const.i32 7
    %c4_36 = vm.const.i32 4
    %c1_37 = vm.const.i32 1
    %zero_38 = vm.const.i64.zero
    vm.call @hal.command_buffer.dispatch(%ref_25, %__device_0_executable_0_main_dispatch_0, %zero_34, %c7_35, %c4_36, %c1_37, %zero_38) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    %c28 = vm.const.i32 28
    %c13 = vm.const.i32 13
    %zero_39 = vm.const.i32.zero
    vm.call @hal.command_buffer.execution_barrier(%ref_25, %c28, %c13, %zero_39) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_25) : (!vm.ref<!hal.command_buffer>) -> ()
    %zero_40 = vm.const.i32.zero
    %ref_41 = vm.call @hal.fence.create(%__device_0, %zero_40) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_14, %ref_41, [%ref_25]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_41]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_fail %20, "failed to wait on timepoint"
    %ref_42 = vm.call.variadic @hal.buffer_view.create(%ref_17, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_42 : !vm.ref<!hal.buffer_view>
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::ReifyRodataTablesPass (iree-vm-reify-rodata-tables) //----- //
vm.module public @module {
  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.initializer {
    %null = vm.const.ref.zero : !vm.ref<!hal.executable>
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1 = vm.const.i64 1
    %null_1 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
    %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref<!hal.device>
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %5 = vm.and.i32 %req, %slt : i32
    vm.cond_br %5, ^bb2, ^bb5
  ^bb2:  // pred: ^bb1
    %6 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %buffer = vm.rodata.inline "_utf8_hal_device_id_D0F1B3E9D63E707C" {alignment = 1 : i64} : !vm.buffer = "hal.device.id"
    %buffer_2 = vm.rodata.inline "_utf8_local_8DC315A014BAFA34" {alignment = 1 : i64} : !vm.buffer = "local*"
    %7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_2) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %7#1 : i64
    %zero_3 = vm.const.i32.zero
    %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32
    %c1_4 = vm.const.i32 1
    vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
  ^bb3:  // pred: ^bb2
    %buffer_5 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
    %buffer_6 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
    %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_5, %buffer_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %9#1 : i64
    %zero_8 = vm.const.i32.zero
    %10 = vm.select.i32 %9#0, %nz_7, %zero_8 : i32
    %c1_9 = vm.const.i32 1
    vm.br ^bb4(%10 : i32)
  ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %12 = vm.select.i64 %11, %c1, %zero_0 : i64
    %13 = vm.add.i64 %3, %12 : i64
    %14 = vm.and.i32 %11, %eq : i32
    %ref_10 = vm.select.ref %14, %ref, %null_1 : !vm.ref<!hal.device>
    %15 = vm.add.i64 %2, %c1 : i64
    vm.br ^bb1(%15, %13, %ref_10 : i64, i64, !vm.ref<!hal.device>)
  ^bb5:  // pred: ^bb1
    vm.cond_br %req, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>"
    vm.br ^bb7
  ^bb7:  // 2 preds: ^bb5, ^bb6
    %buffer_11 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
    %buffer_12 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
    %16:2 = vm.call @hal.device.query.i64(%4, %buffer_11, %buffer_12) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer,
!vm.buffer) -> (i32, i64) %nz_13 = vm.cmp.nz.i64 %16#1 : i64 %zero_14 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_13, %zero_14 : i32 %c1_15 = vm.const.i32 1 %c1_16 = vm.const.i32 1 %zero_17 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_18 = vm.const.i32 1 %c7_19 = vm.const.i32 7 %c3_20 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_21 = vm.const.i32 7 %c2_22 = vm.const.i32 2 %ref_23 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_16, [(%zero_17, %c7, %c3), (%c1_18, %c7_19, %c3_20), (%c2, %c7_21, %c2_22)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_24 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_23]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_25 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_25, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %buffer_26 = vm.rodata.inline "main_dispatch_0_embedded_elf_x86_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666
666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000
00000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> %buffer_27 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %null_28 = vm.const.ref.zero : !vm.buffer %ref_29 = vm.call.variadic @hal.executable.create(%4, %buffer_27, %buffer_26, %null_28, [%ref_24]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref<!hal.executable>
    vm.br ^bb10(%ref_29 : !vm.ref<!hal.executable>)
  ^bb9:  // pred: ^bb7
    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    vm.br ^bb10(%null : !vm.ref<!hal.executable>)
  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
    vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_24, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref attributes {nosideeffects} vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %buffer = vm.rodata.inline "_utf8_input0_DA9A70D360954439" {alignment = 1 : i64} : !vm.buffer = "input0" vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %buffer_7 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor" %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %buffer_7, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %buffer_8 = vm.rodata.inline "_utf8_input1_FDCC539DA203DDD3" {alignment = 1 : i64} : !vm.buffer = "input1" vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_8, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
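// Note (worked sizes): %c553648160 is 0x21000020, IREE's HAL element-type
// encoding for f32 (numerical-type tag in the high byte, bit width
// 0x20 = 32 in the low byte), matching the f32 tensors in the reflection
// metadata. The static byte sizes asserted and allocated below are plain
// products:
//   %input1: 4 * 6 * 5 * 5 elements * 4 B/element = 2400  (%c2400)
//   %output: 2 * 4 * 7 * 9 elements * 4 B/element = 2016  (%c2016)
// %7 builds the dynamic %input0 byte size the same way, folding the
// 4 B/element factor (%c4) into the product of dims %0..%3.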
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %buffer_10 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor" %c16_11 = vm.const.i32 16 %c3075_12 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_6, %c2400, %c16_11, %c3075_12) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_13 = vm.const.i32.zero %ref_14 = vm.call @hal.fence.create(%__device_0, %zero_13) : (!vm.ref, i32) -> !vm.ref %zero_15 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_16 = vm.const.i32 3075 %ref_17 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_14, %zero_15, %c48, %c3075_16, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_18 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_18 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_19 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_19 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_20 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_20 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_21 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_21 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_22 = vm.const.i32 1 %c3_23 = vm.const.i32 3 %zero_24 = vm.const.i32.zero %ref_25 = vm.call @hal.command_buffer.create(%__device_0, %c1_22, %c3_23, %c-1_0, %zero_24) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_26 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_25, %__device_0_pipeline_layout_0, %zero_26, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_27 = vm.const.i32.zero %zero_28 = vm.const.i32.zero %zero_29 = vm.const.i32.zero %c1_30 = vm.const.i32 1 %zero_31 = vm.const.i32.zero %c2_32 = vm.const.i32 2 %zero_33 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_25, %__device_0_pipeline_layout_0, %zero_27, [(%zero_28, %zero_29, %ref, %zero_1, %7), (%c1_30, %zero_31, %ref_9, %zero_1, %c2400), (%c2_32, %zero_33, %ref_17, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_34 = vm.const.i32.zero %c7_35 = vm.const.i32 7 %c4_36 = vm.const.i32 4 %c1_37 = vm.const.i32 1 %zero_38 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_25, %__device_0_executable_0_main_dispatch_0, %zero_34, %c7_35, %c4_36, %c1_37, %zero_38) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_39 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_25, %c28, %c13, %zero_39) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_25) : (!vm.ref) -> () %zero_40 = vm.const.i32.zero %ref_41 = vm.call @hal.fence.create(%__device_0, %zero_40) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_14, %ref_41, [%ref_25]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_41]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_42 = vm.call.variadic @hal.buffer_view.create(%ref_17, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_42 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::HoistInlinedRodataPass (iree-vm-hoist-inlined-rodata) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref %slt = vm.cmp.lt.i64.s %2, %1 : i64 %5 = vm.and.i32 %req, %slt : i32 vm.cond_br %5, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %6 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref %buffer = vm.rodata.inline "_utf8_hal_device_id_D0F1B3E9D63E707C" {alignment = 1 : i64} : !vm.buffer = "hal.device.id" %buffer_2 = vm.rodata.inline "_utf8_local_8DC315A014BAFA34" {alignment = 1 : i64} : !vm.buffer = "local*" %7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_2) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %7#1 : i64 %zero_3 = vm.const.i32.zero %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32 %c1_4 = vm.const.i32 1 vm.cond_br %8, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %buffer_5 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format" %buffer_6 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_5, %buffer_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %9#1 : i64 %zero_8 = vm.const.i32.zero %10 = vm.select.i32 %9#0, %nz_7, %zero_8 : i32 %c1_9 = vm.const.i32 1 vm.br ^bb4(%10 : i32) ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %12 = vm.select.i64 %11, %c1, %zero_0 : i64 %13 = vm.add.i64 %3, %12 : i64 %14 = vm.and.i32 %11, %eq : i32 %ref_10 = vm.select.ref %14, %ref, %null_1 : !vm.ref %15 = vm.add.i64 %2, %c1 : i64 vm.br ^bb1(%15, %13, %ref_10 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %req, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>" vm.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %buffer_11 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format" %buffer_12 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %16:2 = vm.call @hal.device.query.i64(%4, %buffer_11, %buffer_12) {nosideeffects} : (!vm.ref, !vm.buffer, 
!vm.buffer) -> (i32, i64) %nz_13 = vm.cmp.nz.i64 %16#1 : i64 %zero_14 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_13, %zero_14 : i32 %c1_15 = vm.const.i32 1 %c1_16 = vm.const.i32 1 %zero_17 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_18 = vm.const.i32 1 %c7_19 = vm.const.i32 7 %c3_20 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_21 = vm.const.i32 7 %c2_22 = vm.const.i32 2 %ref_23 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_16, [(%zero_17, %c7, %c3), (%c1_18, %c7_19, %c3_20), (%c2, %c7_21, %c2_22)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_24 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_23]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_25 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_25, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %buffer_26 = vm.rodata.inline "main_dispatch_0_embedded_elf_x86_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666
666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000
00000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> %buffer_27 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %null_28 = vm.const.ref.zero : !vm.buffer %ref_29 = vm.call.variadic @hal.executable.create(%4, %buffer_27, %buffer_26, %null_28, [%ref_24]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.br ^bb10(%ref_29 : !vm.ref) ^bb9: // pred: ^bb7 vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" vm.br ^bb10(%null : !vm.ref) ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_24, @__device_0_pipeline_layout_0 : !vm.ref vm.return } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
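// Note: the initializer above does the one-time device and executable
// setup: it enumerates devices, takes the first whose "hal.device.id"
// matches "local*" and that reports support for "embedded-elf-x86_64", then
// creates a descriptor set layout with three buffer bindings (the two
// inputs plus the result) and a pipeline layout with 8 push constants.
// Those 8 constants are the four dynamic i64 dims of %input0, each split
// into lo/hi i32 halves in @main below, e.g. for dim %0:
//   %8 = vm.trunc.i64.i32 %0 : i64 -> i32      // lo 32 bits
//   %9 = vm.shr.i64.u %0, %c32_18 : i64
//   %10 = vm.trunc.i64.i32 %9 : i64 -> i32     // hi 32 bits
// before being handed to hal.command_buffer.push_constants.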
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %buffer = vm.rodata.inline "_utf8_input0_DA9A70D360954439" {alignment = 1 : i64} : !vm.buffer = "input0" vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %buffer_7 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor" %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %buffer_7, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %buffer_8 = vm.rodata.inline "_utf8_input1_FDCC539DA203DDD3" {alignment = 1 : i64} : !vm.buffer = "input1" vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_8, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %buffer_10 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor" %c16_11 = vm.const.i32 16 %c3075_12 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_6, %c2400, %c16_11, %c3075_12) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_13 = vm.const.i32.zero %ref_14 = vm.call @hal.fence.create(%__device_0, %zero_13) : (!vm.ref, i32) -> !vm.ref %zero_15 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_16 = vm.const.i32 3075 %ref_17 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_14, %zero_15, %c48, %c3075_16, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_18 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_18 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_19 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_19 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_20 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_20 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_21 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_21 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_22 = vm.const.i32 1 %c3_23 = vm.const.i32 3 %zero_24 = vm.const.i32.zero %ref_25 = vm.call @hal.command_buffer.create(%__device_0, %c1_22, %c3_23, %c-1_0, %zero_24) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_26 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_25, %__device_0_pipeline_layout_0, %zero_26, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_27 = vm.const.i32.zero %zero_28 = vm.const.i32.zero %zero_29 = vm.const.i32.zero %c1_30 = vm.const.i32 1 %zero_31 = vm.const.i32.zero %c2_32 = vm.const.i32 2 %zero_33 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_25, %__device_0_pipeline_layout_0, %zero_27, [(%zero_28, %zero_29, %ref, %zero_1, %7), (%c1_30, %zero_31, %ref_9, %zero_1, %c2400), (%c2_32, %zero_33, %ref_17, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_34 = vm.const.i32.zero %c7_35 = vm.const.i32 7 %c4_36 = vm.const.i32 4 %c1_37 = vm.const.i32 1 %zero_38 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_25, %__device_0_executable_0_main_dispatch_0, %zero_34, %c7_35, %c4_36, %c1_37, %zero_38) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_39 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_25, %c28, %c13, %zero_39) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_25) : (!vm.ref) -> () %zero_40 = vm.const.i32.zero %ref_41 = vm.call @hal.fence.create(%__device_0, %zero_40) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_14, %ref_41, [%ref_25]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_41]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_42 = vm.call.variadic @hal.buffer_view.create(%ref_17, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_42 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump After mlir::iree_compiler::IREE::VM::HoistInlinedRodataPass (iree-vm-hoist-inlined-rodata) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3_0 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8
B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000
180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 {alignment = 1 : i64} "embedded-elf-x86_64" vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref %slt = vm.cmp.lt.i64.s %2, %1 : i64 %5 = vm.and.i32 %req, %slt : i32 vm.cond_br %5, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %6 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %7:2 = vm.call @hal.device.query.i64(%ref, 
%_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %7#1 : i64 %zero_2 = vm.const.i32.zero %8 = vm.select.i32 %7#0, %nz, %zero_2 : i32 %c1_3 = vm.const.i32 1 vm.cond_br %8, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_4 = vm.cmp.nz.i64 %9#1 : i64 %zero_5 = vm.const.i32.zero %10 = vm.select.i32 %9#0, %nz_4, %zero_5 : i32 %c1_6 = vm.const.i32 1 vm.br ^bb4(%10 : i32) ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %12 = vm.select.i64 %11, %c1, %zero_0 : i64 %13 = vm.add.i64 %3, %12 : i64 %14 = vm.and.i32 %11, %eq : i32 %ref_7 = vm.select.ref %14, %ref, %null_1 : !vm.ref %15 = vm.add.i64 %2, %c1 : i64 vm.br ^bb1(%15, %13, %ref_7 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %req, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>" vm.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %_utf8_hal_executable_format_1F9665C75F0004D3_0 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3_0 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 : !vm.buffer %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_0, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_1) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_8 = vm.cmp.nz.i64 %16#1 : i64 %zero_9 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_8, %zero_9 : i32 %c1_10 = vm.const.i32 1 %c1_11 = vm.const.i32 1 %zero_12 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_13 = vm.const.i32 1 %c7_14 = vm.const.i32 7 %c3_15 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_16 = vm.const.i32 7 %c2_17 = vm.const.i32 2 %ref_18 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_11, [(%zero_12, %c7, %c3), (%c1_13, %c7_14, %c3_15), (%c2, %c7_16, %c2_17)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_19 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_18]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) 
-> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_20 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_20, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 : !vm.buffer %null_21 = vm.const.ref.zero : !vm.buffer %ref_22 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_2, %main_dispatch_0_embedded_elf_x86_64, %null_21, [%ref_19]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.br ^bb10(%ref_22 : !vm.ref) ^bb9: // pred: ^bb7 vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" vm.br ^bb10(%null : !vm.ref) ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_19, @__device_0_pipeline_layout_0 : !vm.ref vm.return } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
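// Note: comparing this dump with the "before" dump above shows the whole
// effect of iree-vm-hoist-inlined-rodata: every vm.rodata.inline that used
// to materialize its buffer at the use site becomes a module-level
// vm.rodata plus a vm.const.ref.rodata at the old use. A minimal sketch of
// the rewrite (the @_utf8_key symbol is illustrative, not from this module):
//
//   // before:
//   vm.func @f() {
//     %key = vm.rodata.inline "_utf8_key" {alignment = 1 : i64} : !vm.buffer = "key"
//     ...
//   }
//
//   // after:
//   vm.rodata private @_utf8_key {alignment = 1 : i64} "key"
//   vm.func @f() {
//     %key = vm.const.ref.rodata @_utf8_key : !vm.buffer
//     ...
//   }
//
// Textually identical payloads are kept as distinct symbols and uniqued
// with _0/_1/_2 suffixes, which is why "hal.executable.format" and
// "embedded-elf-x86_64" each appear more than once in the rodata list
// above.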
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.rodata private @_utf8_tensor_41A152EEDB094D7A_3 {alignment = 1 : i64} "tensor" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
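// note: 553648160 = 0x21000020, the HAL element-type code for f32. The two
// hal.buffer_view.assert calls above check that input0/input1 are f32 tensors of the expected
// shapes, and %4..%7 compute input0's dynamic byte length (product of its four dims x 4 bytes).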
%ref_7 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_3 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A_3 : !vm.buffer %c16_8 = vm.const.i32 16 %c3075_9 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_7, %_utf8_tensor_41A152EEDB094D7A_3, %ref_6, %c2400, %c16_8, %c3075_9) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_10 = vm.const.i32.zero %ref_11 = vm.call @hal.fence.create(%__device_0, %zero_10) : (!vm.ref, i32) -> !vm.ref %zero_12 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_13 = vm.const.i32 3075 %ref_14 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_11, %zero_12, %c48, %c3075_13, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_15 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_15 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_16 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_16 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_17 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_17 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_18 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_18 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_19 = vm.const.i32 1 %c3_20 = vm.const.i32 3 %zero_21 = vm.const.i32.zero %ref_22 = vm.call @hal.command_buffer.create(%__device_0, %c1_19, %c3_20, %c-1_0, %zero_21) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_23 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_22, %__device_0_pipeline_layout_0, %zero_23, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_24 = vm.const.i32.zero %zero_25 = vm.const.i32.zero %zero_26 = vm.const.i32.zero %c1_27 = vm.const.i32 1 %zero_28 = vm.const.i32.zero %c2_29 = vm.const.i32 2 %zero_30 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_22, %__device_0_pipeline_layout_0, %zero_24, [(%zero_25, %zero_26, %ref, %zero_1, %7), (%c1_27, %zero_28, %ref_7, %zero_1, %c2400), (%c2_29, %zero_30, %ref_14, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_31 = vm.const.i32.zero %c7_32 = vm.const.i32 7 %c4_33 = vm.const.i32 4 %c1_34 = vm.const.i32 1 %zero_35 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_22, %__device_0_executable_0_main_dispatch_0, %zero_31, %c7_32, %c4_33, %c1_34, %zero_35) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_36 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_22, %c28, %c13, %zero_36) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_22) : (!vm.ref) -> () %zero_37 = vm.const.i32.zero %ref_38 = vm.call @hal.fence.create(%__device_0, %zero_37) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_11, %ref_38, [%ref_22]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_38]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_39 = vm.call.variadic @hal.buffer_view.create(%ref_14, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_39 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::DeduplicateRodataPass (iree-vm-deduplicate-rodata) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3_0 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5
A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C80100000000000030000000000000000300000001000000080000000000000018
0000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 {alignment = 1 : i64} "embedded-elf-x86_64" vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref %slt = vm.cmp.lt.i64.s %2, %1 : i64 %5 = vm.and.i32 %req, %slt : i32 vm.cond_br %5, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %6 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %7:2 = vm.call @hal.device.query.i64(%ref, 
%_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %7#1 : i64 %zero_2 = vm.const.i32.zero %8 = vm.select.i32 %7#0, %nz, %zero_2 : i32 %c1_3 = vm.const.i32 1 vm.cond_br %8, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_4 = vm.cmp.nz.i64 %9#1 : i64 %zero_5 = vm.const.i32.zero %10 = vm.select.i32 %9#0, %nz_4, %zero_5 : i32 %c1_6 = vm.const.i32 1 vm.br ^bb4(%10 : i32) ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %12 = vm.select.i64 %11, %c1, %zero_0 : i64 %13 = vm.add.i64 %3, %12 : i64 %14 = vm.and.i32 %11, %eq : i32 %ref_7 = vm.select.ref %14, %ref, %null_1 : !vm.ref %15 = vm.add.i64 %2, %c1 : i64 vm.br ^bb1(%15, %13, %ref_7 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %req, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>" vm.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %_utf8_hal_executable_format_1F9665C75F0004D3_0 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3_0 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 : !vm.buffer %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_0, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_1) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_8 = vm.cmp.nz.i64 %16#1 : i64 %zero_9 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_8, %zero_9 : i32 %c1_10 = vm.const.i32 1 %c1_11 = vm.const.i32 1 %zero_12 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_13 = vm.const.i32 1 %c7_14 = vm.const.i32 7 %c3_15 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_16 = vm.const.i32 7 %c2_17 = vm.const.i32 2 %ref_18 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_11, [(%zero_12, %c7, %c3), (%c1_13, %c7_14, %c3_15), (%c2, %c7_16, %c2_17)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_19 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_18]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) 
-> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_20 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_20, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 : !vm.buffer %null_21 = vm.const.ref.zero : !vm.buffer %ref_22 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_2, %main_dispatch_0_embedded_elf_x86_64, %null_21, [%ref_19]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.br ^bb10(%ref_22 : !vm.ref) ^bb9: // pred: ^bb7 vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" vm.br ^bb10(%null : !vm.ref) ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_19, @__device_0_pipeline_layout_0 : !vm.ref vm.return } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.rodata private @_utf8_tensor_41A152EEDB094D7A_3 {alignment = 1 : i64} "tensor" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
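// note: the ops below record a one-shot command buffer: hal.device.queue.alloca reserves the
// 2016-byte output (2x4x7x9 x f32), push_constants passes the four input dims split into
// lo/hi i32 halves, push_descriptor_set binds input0/input1/output, the dispatch launches
// main_dispatch_0 over a 7x4x1 workgroup grid, and hal.fence.await blocks until the queue
// signals completion.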
%ref_7 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_3 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A_3 : !vm.buffer %c16_8 = vm.const.i32 16 %c3075_9 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_7, %_utf8_tensor_41A152EEDB094D7A_3, %ref_6, %c2400, %c16_8, %c3075_9) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_10 = vm.const.i32.zero %ref_11 = vm.call @hal.fence.create(%__device_0, %zero_10) : (!vm.ref, i32) -> !vm.ref %zero_12 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_13 = vm.const.i32 3075 %ref_14 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_11, %zero_12, %c48, %c3075_13, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_15 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_15 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_16 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_16 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_17 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_17 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_18 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_18 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_19 = vm.const.i32 1 %c3_20 = vm.const.i32 3 %zero_21 = vm.const.i32.zero %ref_22 = vm.call @hal.command_buffer.create(%__device_0, %c1_19, %c3_20, %c-1_0, %zero_21) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_23 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_22, %__device_0_pipeline_layout_0, %zero_23, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_24 = vm.const.i32.zero %zero_25 = vm.const.i32.zero %zero_26 = vm.const.i32.zero %c1_27 = vm.const.i32 1 %zero_28 = vm.const.i32.zero %c2_29 = vm.const.i32 2 %zero_30 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_22, %__device_0_pipeline_layout_0, %zero_24, [(%zero_25, %zero_26, %ref, %zero_1, %7), (%c1_27, %zero_28, %ref_7, %zero_1, %c2400), (%c2_29, %zero_30, %ref_14, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_31 = vm.const.i32.zero %c7_32 = vm.const.i32 7 %c4_33 = vm.const.i32 4 %c1_34 = vm.const.i32 1 %zero_35 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_22, %__device_0_executable_0_main_dispatch_0, %zero_31, %c7_32, %c4_33, %c1_34, %zero_35) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_36 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_22, %c28, %c13, %zero_36) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_22) : (!vm.ref) -> () %zero_37 = vm.const.i32.zero %ref_38 = vm.call @hal.fence.create(%__device_0, %zero_37) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_11, %ref_38, [%ref_22]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_38]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_39 = vm.call.variadic @hal.buffer_view.create(%ref_14, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_39 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump After mlir::iree_compiler::IREE::VM::DeduplicateRodataPass (iree-vm-deduplicate-rodata) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F84000000
00000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000
0000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref %slt = vm.cmp.lt.i64.s %2, %1 : i64 %5 = vm.and.i32 %req, %slt : i32 vm.cond_br %5, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %6 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %7#1 : i64 %zero_2 = vm.const.i32.zero %8 = vm.select.i32 %7#0, %nz, %zero_2 : i32 %c1_3 = vm.const.i32 1 vm.cond_br %8, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 
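// note: after iree-vm-deduplicate-rodata the _0/_1 copies of the "hal.executable.format" and
// "embedded-elf-x86_64" strings are gone from the rodata list above; each use site now
// references the single canonical symbol (locally renamed to %..._8/%..._9 below).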
%_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_4 = vm.cmp.nz.i64 %9#1 : i64 %zero_5 = vm.const.i32.zero %10 = vm.select.i32 %9#0, %nz_4, %zero_5 : i32 %c1_6 = vm.const.i32 1 vm.br ^bb4(%10 : i32) ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %12 = vm.select.i64 %11, %c1, %zero_0 : i64 %13 = vm.add.i64 %3, %12 : i64 %14 = vm.and.i32 %11, %eq : i32 %ref_7 = vm.select.ref %14, %ref, %null_1 : !vm.ref %15 = vm.add.i64 %2, %c1 : i64 vm.br ^bb1(%15, %13, %ref_7 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %req, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>" vm.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %_utf8_hal_executable_format_1F9665C75F0004D3_8 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_9 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_8, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_9) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_10 = vm.cmp.nz.i64 %16#1 : i64 %zero_11 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_10, %zero_11 : i32 %c1_12 = vm.const.i32 1 %c1_13 = vm.const.i32 1 %zero_14 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_15 = vm.const.i32 1 %c7_16 = vm.const.i32 7 %c3_17 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_18 = vm.const.i32 7 %c2_19 = vm.const.i32 2 %ref_20 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_13, [(%zero_14, %c7, %c3), (%c1_15, %c7_16, %c3_17), (%c2, %c7_18, %c2_19)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_21 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_20]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_22 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_22, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_23 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %null_24 = vm.const.ref.zero : !vm.buffer %ref_25 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_23, %main_dispatch_0_embedded_elf_x86_64, %null_24, [%ref_21]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.br ^bb10(%ref_25 : !vm.ref) ^bb9: // pred: ^bb7 vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" vm.br ^bb10(%null : !vm.ref) ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_21, @__device_0_pipeline_layout_0 : !vm.ref vm.return } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
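// note: both the older HAL entry points (command_buffer.dispatch, executable.create) and
// their minimum_version = 4 successors (dispatch2, executable.create2) are imported below,
// but this module only calls the older forms.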
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
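// ABI prologue, summarized: the four hal.buffer_view.dim calls above read the
// dynamic dims of %input0, 553648160 (0x21000020) is the HAL element-type code
// for f32, and the vm.mul.i64 chain computes the dynamic byte size
// d0*4*d1*d2*d3. %c2400 is the static byte size of the 4x6x5x5xf32 filter
// (600 elements * 4 B) and %c2016 that of the 2x4x7x9xf32 result (504 * 4 B).
// The rest of the function allocates the result on the device queue, records a
// one-shot command buffer (push constants carrying each i64 dim as lo/hi i32
// halves, three buffer bindings, a 7x4x1 workgroup dispatch), submits it
// between the alloca fence and a new signal fence, awaits the signal fence,
// and wraps the result buffer in the returned buffer view.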
%ref_7 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_8 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer %c16_9 = vm.const.i32 16 %c3075_10 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_7, %_utf8_tensor_41A152EEDB094D7A_8, %ref_6, %c2400, %c16_9, %c3075_10) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_11 = vm.const.i32.zero %ref_12 = vm.call @hal.fence.create(%__device_0, %zero_11) : (!vm.ref, i32) -> !vm.ref %zero_13 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_14 = vm.const.i32 3075 %ref_15 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_12, %zero_13, %c48, %c3075_14, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_16 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_16 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_17 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_17 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_18 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_18 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_19 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_19 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_20 = vm.const.i32 1 %c3_21 = vm.const.i32 3 %zero_22 = vm.const.i32.zero %ref_23 = vm.call @hal.command_buffer.create(%__device_0, %c1_20, %c3_21, %c-1_0, %zero_22) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_24 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_23, %__device_0_pipeline_layout_0, %zero_24, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_25 = vm.const.i32.zero %zero_26 = vm.const.i32.zero %zero_27 = vm.const.i32.zero %c1_28 = vm.const.i32 1 %zero_29 = vm.const.i32.zero %c2_30 = vm.const.i32 2 %zero_31 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_23, %__device_0_pipeline_layout_0, %zero_25, [(%zero_26, %zero_27, %ref, %zero_1, %7), (%c1_28, %zero_29, %ref_7, %zero_1, %c2400), (%c2_30, %zero_31, %ref_15, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_32 = vm.const.i32.zero %c7_33 = vm.const.i32 7 %c4_34 = vm.const.i32 4 %c1_35 = vm.const.i32 1 %zero_36 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_23, %__device_0_executable_0_main_dispatch_0, %zero_32, %c7_33, %c4_34, %c1_35, %zero_36) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_37 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_23, %c28, %c13, %zero_37) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_23) : (!vm.ref) -> () %zero_38 = vm.const.i32.zero %ref_39 = vm.call @hal.fence.create(%__device_0, %zero_38) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_12, %ref_39, [%ref_23]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_39]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_40 = vm.call.variadic @hal.buffer_view.create(%ref_15, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_40 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34
D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000
01000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, 
^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_11 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %ref_12 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_11, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_12, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
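// Relative to the pre-canonicalization initializer above, canonicalize
// deduplicated and hoisted the vm.const ops into the entry block (the
// descriptor set layout bindings now reuse one %c1/%c2/%c3/%c7), rewrote the
// null test from vm.cmp.eq.ref against %null into vm.cmp.nz.ref followed by
// vm.xor with 1, and expanded the vm.cond_fail pseudo-ops so that ^bb6 and
// ^bb9 now terminate directly in vm.fail, folding the former ^bb10 join into
// ^bb8.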
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
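// NOTE: %4..%7 fold the dynamic dims of %arg0 into a byte length (4 bytes per
// f32 element times d0*d1*d2*d3), which the hal.buffer.assert above then checks
// as the minimum length of the buffer backing the input view.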
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_7 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A_7, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_8 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_9 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_8, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_10 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_10, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_10, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_9, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_10, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_10, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_10) : (!vm.ref) -> () %ref_11 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_8, %ref_11, [%ref_10]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_11]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2(%20 : i32), ^bb1 ^bb1: // pred: ^bb0 %ref_12 = vm.call.variadic @hal.buffer_view.create(%ref_9, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_12 : !vm.ref ^bb2(%21: i32): // pred: ^bb0 vm.fail %21, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before CSE (cse) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD0
4989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000
000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = 
vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_11 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %ref_12 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_11, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_12, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
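// NOTE: the vm.initializer above performs device selection; roughly, as an
// illustrative sketch (not part of the module):
//   for i in [0, hal.devices.count()):                      // ^bb1/^bb2
//     device = hal.devices.get(i)
//     if query_i64(device, "hal.device.id", "local*") and
//        query_i64(device, "hal.executable.format", "embedded-elf-x86_64"):
//       remember the first matching device                  // ^bb4
//   if none matched: vm.fail (^bb6); else store @__device_0 and build the
//   descriptor set layout, pipeline layout, and executable (^bb7/^bb8).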
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
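// NOTE: %c2400 and %c2016 are the static byte sizes of the weights and result
// (4*6*5*5*4 = 2400 and 2*4*7*9*4 = 2016). Below, each dynamic i64 dim is split
// into lo/hi i32 halves (vm.trunc.i64.i32 plus vm.shr.i64.u by %c32) to fit the
// i32 push-constant path, and the command buffer then records push_constants,
// push_descriptor_set, a 7x4x1 workgroup dispatch, and an execution barrier
// before fenced queue execution.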
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_7 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A_7, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_8 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_9 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_8, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_10 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_10, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_10, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_9, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_10, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_10, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_10) : (!vm.ref) -> () %ref_11 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_8, %ref_11, [%ref_10]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_11]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2(%20 : i32), ^bb1 ^bb1: // pred: ^bb0 %ref_12 = vm.call.variadic @hal.buffer_view.create(%ref_9, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_12 : !vm.ref ^bb2(%21: i32): // pred: ^bb0 vm.fail %21, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After CSE (cse) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04
989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000
00000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = 
vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
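// NOTE: relative to the pre-CSE initializer, CSE removed the redundant
// vm.const.ref.rodata reload of @_utf8_embedded_elf_x86_64_11EF7D6636570B50 in
// ^bb8 (it now reuses %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 from ^bb7),
// renumbering %ref_12 to %ref_11; the rest of the module appears unchanged by
// the pass.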
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2(%20 : i32), ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2(%21: i32): // pred: ^bb0 vm.fail %21, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB
84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003
0000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64
      %9 = vm.select.i32 %8#0, %nz, %zero : i32
      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
    ^bb3: // pred: ^bb2
      %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
      vm.br ^bb4(%11 : i32)
    ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
      %14 = vm.add.i64 %3, %13 : i64
      %15 = vm.and.i32 %12, %eq : i32
      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref
      %16 = vm.add.i64 %2, %c1_1 : i64
      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref)
    ^bb5: // pred: ^bb1
      vm.cond_br %5, ^bb6, ^bb7
    ^bb6: // pred: ^bb5
      vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
    ^bb7: // pred: ^bb5
      %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
      %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref
      %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref
      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
      %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
      vm.global.store.ref %4, @__device_0 : !vm.ref
      vm.cond_br %eq_10, ^bb8, ^bb9
    ^bb8: // pred: ^bb7
      %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
      %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref
      vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref
      vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref
      vm.return
    ^bb9: // pred: ^bb7
      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    }
    vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref
    vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref
    vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref
    vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
    vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects}
    vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32
    vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32)
    vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects}
    vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...)
    vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects}
    vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32}
    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref)
    vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer)
    vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref)
    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
    vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
    vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
    vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64)
    vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
    vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...)
    vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...)
    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
    vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64)
    vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
    vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref
    vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref)
    vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...)
    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...)
    vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64)
    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
    vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects}
    vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref
    vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.fence.query(%fence : !vm.ref) -> i32
    vm.import private @hal.fence.signal(%fence : !vm.ref)
    vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32)
    vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield}
    vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
    vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
    vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
    vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
    vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
      %c13 = vm.const.i32 13
      %c28 = vm.const.i32 28
      %c4 = vm.const.i32 4
      %c7 = vm.const.i32 7
      %c32 = vm.const.i32 32
      %c48 = vm.const.i32 48
      %c3075 = vm.const.i32 3075
      %c16 = vm.const.i32 16
      %c553648160 = vm.const.i32 553648160
      %c3 = vm.const.i32 3
      %c2 = vm.const.i32 2
      %c1 = vm.const.i32 1
      %zero = vm.const.i32.zero
      %c9 = vm.const.i64 9
      %c-1 = vm.const.i32 -1
      %c7_0 = vm.const.i64 7
      %c2_1 = vm.const.i64 2
      %zero_2 = vm.const.i64.zero
      %null = vm.const.ref.zero : !vm.ref
      %c-1_3 = vm.const.i64 -1
      %c2016 = vm.const.i64 2016
      %c2400 = vm.const.i64 2400
      %c5 = vm.const.i64 5
      %c4_4 = vm.const.i64 4
      %c6 = vm.const.i64 6
      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref
      %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref
      %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref
      %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64
      %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64
      %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64
      %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64
      %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
      %4 = vm.mul.i64 %0, %c4_4 : i64
      %5 = vm.mul.i64 %4, %1 : i64
      %6 = vm.mul.i64 %5, %2 : i64
      %7 = vm.mul.i64 %6, %3 : i64
      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref
      %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref
      %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
      %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
      %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref
      vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
      %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
      %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref
      %8 = vm.trunc.i64.i32 %0 : i64 -> i32
      %9 = vm.shr.i64.u %0, %c32 : i64
      %10 = vm.trunc.i64.i32 %9 : i64 -> i32
      %11 = vm.trunc.i64.i32 %1 : i64 -> i32
      %12 = vm.shr.i64.u %1, %c32 : i64
      %13 = vm.trunc.i64.i32 %12 : i64 -> i32
      %14 = vm.trunc.i64.i32 %2 : i64 -> i32
      %15 = vm.shr.i64.u %2, %c32 : i64
      %16 = vm.trunc.i64.i32 %15 : i64 -> i32
      %17 = vm.trunc.i64.i32 %3 : i64 -> i32
      %18 = vm.shr.i64.u %3, %c32 : i64
      %19 = vm.trunc.i64.i32 %18 : i64 -> i32
      %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref
      vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...)
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...)
      vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> ()
      vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> ()
      %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
      vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...)
      %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32
      vm.cond_br %20, ^bb2(%20 : i32), ^bb1
    ^bb1: // pred: ^bb0
      %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2(%21: i32): // pred: ^bb0 vm.fail %21, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB8
4531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030
000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
    vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
    vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
    vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
    vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
    vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
    vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
    vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
    vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
    vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
    vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
    vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
    vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
    vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
    vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
    vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
    vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
    vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
    vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
    vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
    vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
    vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
    vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
    vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
    vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
    vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
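    // @main below is the synchronous ABI wrapper: it queries the four dynamic dims
    // of %input0, asserts both buffer views (f32 = 0x21000020, dense encoding = 1),
    // allocates the 2016-byte result (2x4x7x9 f32) via queue.alloca, records one
    // command buffer that pushes each i64 dim as lo/hi i32 push constants (8 values)
    // and binds the three buffers, dispatches main_dispatch_0 over a 7x4x1 workgroup
    // grid, then awaits the signal fence and wraps the result in a buffer view.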
    vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
      %c13 = vm.const.i32 13
      %c28 = vm.const.i32 28
      %c4 = vm.const.i32 4
      %c7 = vm.const.i32 7
      %c32 = vm.const.i32 32
      %c48 = vm.const.i32 48
      %c3075 = vm.const.i32 3075
      %c16 = vm.const.i32 16
      %c553648160 = vm.const.i32 553648160
      %c3 = vm.const.i32 3
      %c2 = vm.const.i32 2
      %c1 = vm.const.i32 1
      %zero = vm.const.i32.zero
      %c9 = vm.const.i64 9
      %c-1 = vm.const.i32 -1
      %c7_0 = vm.const.i64 7
      %c2_1 = vm.const.i64 2
      %zero_2 = vm.const.i64.zero
      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
      %c-1_3 = vm.const.i64 -1
      %c2016 = vm.const.i64 2016
      %c2400 = vm.const.i64 2400
      %c5 = vm.const.i64 5
      %c4_4 = vm.const.i64 4
      %c6 = vm.const.i64 6
      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
      %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %4 = vm.mul.i64 %0, %c4_4 : i64
      %5 = vm.mul.i64 %4, %1 : i64
      %6 = vm.mul.i64 %5, %2 : i64
      %7 = vm.mul.i64 %6, %3 : i64
      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
      %8 = vm.trunc.i64.i32 %0 : i64 -> i32
      %9 = vm.shr.i64.u %0, %c32 : i64
      %10 = vm.trunc.i64.i32 %9 : i64 -> i32
      %11 = vm.trunc.i64.i32 %1 : i64 -> i32
      %12 = vm.shr.i64.u %1, %c32 : i64
      %13 = vm.trunc.i64.i32 %12 : i64 -> i32
      %14 = vm.trunc.i64.i32 %2 : i64 -> i32
      %15 = vm.shr.i64.u %2, %c32 : i64
      %16 = vm.trunc.i64.i32 %15 : i64 -> i32
      %17 = vm.trunc.i64.i32 %3 : i64 -> i32
      %18 = vm.shr.i64.u %3, %c32 : i64
      %19 = vm.trunc.i64.i32 %18 : i64 -> i32
      %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
      vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
      vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
      vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
      %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
      %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
      vm.cond_br %20, ^bb2, ^bb1
    ^bb1: // pred: ^bb0
      %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
      vm.return %ref_11 : !vm.ref<!hal.buffer_view>
    ^bb2: // pred: ^bb0
      vm.fail %20, "failed to wait on timepoint"
    }
    vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  }
}
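// The dump below is the first in this excerpt to show the serialized dispatch:
// main_dispatch_0 is embedded as an x86-64 ELF blob in vm.rodata (note the source
// path string baked into its debug info) and the HAL executable is created from
// it inside vm.initializer.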
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
module attributes {vm.toplevel} {
  vm.module public @module {
    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
    vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
    vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
    vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
    vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
    vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C
8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000
00000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
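    // The initializer enumerates devices until one answers the hal.device.id query
    // for "local*", checks that it supports the "embedded-elf-x86_64" executable
    // format, then creates the descriptor set layout, pipeline layout, and
    // executable and stores them in the globals above. Otherwise it fails with
    // status 5 or 14 (NOT_FOUND / UNAVAILABLE in iree_status_code_t).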
    vm.initializer {
      %null = vm.const.ref.zero : !vm.buffer
      %c8 = vm.const.i32 8
      %c2 = vm.const.i32 2
      %c3 = vm.const.i32 3
      %c7 = vm.const.i32 7
      %c1 = vm.const.i32 1
      %c14 = vm.const.i32 14
      %c-1 = vm.const.i64 -1
      %c5 = vm.const.i32 5
      %zero = vm.const.i32.zero
      %zero_0 = vm.const.i64.zero
      %c1_1 = vm.const.i64 1
      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
      %5 = vm.xor.i32 %rnz, %c1 : i32
      %slt = vm.cmp.lt.i64.s %2, %1 : i64
      %6 = vm.and.i32 %5, %slt : i32
      vm.cond_br %6, ^bb2, ^bb5
    ^bb2: // pred: ^bb1
      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
      %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
      %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz = vm.cmp.nz.i64 %8#1 : i64
      %9 = vm.select.i32 %8#0, %nz, %zero : i32
      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
    ^bb3: // pred: ^bb2
      %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
      vm.br ^bb4(%11 : i32)
    ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
      %14 = vm.add.i64 %3, %13 : i64
      %15 = vm.and.i32 %12, %eq : i32
      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
      %16 = vm.add.i64 %2, %c1_1 : i64
      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
    ^bb5: // pred: ^bb1
      vm.cond_br %5, ^bb6, ^bb7
    ^bb6: // pred: ^bb5
      vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
    ^bb7: // pred: ^bb5
      %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
      %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
      %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
      %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
      vm.cond_br %eq_10, ^bb8, ^bb9
    ^bb8: // pred: ^bb7
      %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
      %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
      vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      vm.return
    ^bb9: // pred: ^bb7
      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    }
    vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
    vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
    vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
    vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
    vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
    vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
    vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
    vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
    vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
    vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
    vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
    vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
    vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
    vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
    vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
    vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
    vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
    vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
    vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
    vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
    vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
    vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
    vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
    vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
    vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
    vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
    vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
    vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
    vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
    vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
    vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
    vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
    vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
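    // @main below is byte-identical to the copy of @main in the previous dump.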
    vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
      %c13 = vm.const.i32 13
      %c28 = vm.const.i32 28
      %c4 = vm.const.i32 4
      %c7 = vm.const.i32 7
      %c32 = vm.const.i32 32
      %c48 = vm.const.i32 48
      %c3075 = vm.const.i32 3075
      %c16 = vm.const.i32 16
      %c553648160 = vm.const.i32 553648160
      %c3 = vm.const.i32 3
      %c2 = vm.const.i32 2
      %c1 = vm.const.i32 1
      %zero = vm.const.i32.zero
      %c9 = vm.const.i64 9
      %c-1 = vm.const.i32 -1
      %c7_0 = vm.const.i64 7
      %c2_1 = vm.const.i64 2
      %zero_2 = vm.const.i64.zero
      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
      %c-1_3 = vm.const.i64 -1
      %c2016 = vm.const.i64 2016
      %c2400 = vm.const.i64 2400
      %c5 = vm.const.i64 5
      %c4_4 = vm.const.i64 4
      %c6 = vm.const.i64 6
      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
      %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %4 = vm.mul.i64 %0, %c4_4 : i64
      %5 = vm.mul.i64 %4, %1 : i64
      %6 = vm.mul.i64 %5, %2 : i64
      %7 = vm.mul.i64 %6, %3 : i64
      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
      %8 = vm.trunc.i64.i32 %0 : i64 -> i32
      %9 = vm.shr.i64.u %0, %c32 : i64
      %10 = vm.trunc.i64.i32 %9 : i64 -> i32
      %11 = vm.trunc.i64.i32 %1 : i64 -> i32
      %12 = vm.shr.i64.u %1, %c32 : i64
      %13 = vm.trunc.i64.i32 %12 : i64 -> i32
      %14 = vm.trunc.i64.i32 %2 : i64 -> i32
      %15 = vm.shr.i64.u %2, %c32 : i64
      %16 = vm.trunc.i64.i32 %15 : i64 -> i32
      %17 = vm.trunc.i64.i32 %3 : i64 -> i32
      %18 = vm.shr.i64.u %3, %c32 : i64
      %19 = vm.trunc.i64.i32 %18 : i64 -> i32
      %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
      vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
      vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
      vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
      %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
      %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
      vm.cond_br %20, ^bb2, ^bb1
    ^bb1: // pred: ^bb0
      %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
      vm.return %ref_11 : !vm.ref<!hal.buffer_view>
    ^bb2: // pred: ^bb0
      vm.fail %20, "failed to wait on timepoint"
    }
    vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  }
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged by the pass; its body is byte-identical to the "Before FoldGlobals" dump above)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module identical to the preceding dump; the log is truncated partway through this dump)
8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000
00000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64
  %9 = vm.select.i32 %8#0, %nz, %zero : i32
  vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
  %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_3 = vm.cmp.nz.i64 %10#1 : i64
  %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
  vm.br ^bb4(%11 : i32)
^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
  %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
  %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
  %14 = vm.add.i64 %3, %13 : i64
  %15 = vm.and.i32 %12, %eq : i32
  %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
  %16 = vm.add.i64 %2, %c1_1 : i64
  vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
^bb5: // pred: ^bb1
  vm.cond_br %5, ^bb6, ^bb7
^bb6: // pred: ^bb5
  vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
^bb7: // pred: ^bb5
  %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_7 = vm.cmp.nz.i64 %17#1 : i64
  %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
  %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
  %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
  %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
  %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
  vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
  vm.cond_br %eq_10, ^bb8, ^bb9
^bb8: // pred: ^bb7
  %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
  %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...)
-> !vm.ref<!hal.executable>
  vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.return
^bb9: // pred: ^bb7
  vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
}
vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...)
-> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13 = vm.const.i32 13
  %c28 = vm.const.i32 28
  %c4 = vm.const.i32 4
  %c7 = vm.const.i32 7
  %c32 = vm.const.i32 32
  %c48 = vm.const.i32 48
  %c3075 = vm.const.i32 3075
  %c16 = vm.const.i32 16
  %c553648160 = vm.const.i32 553648160
  %c3 = vm.const.i32 3
  %c2 = vm.const.i32 2
  %c1 = vm.const.i32 1
  %zero = vm.const.i32.zero
  %c9 = vm.const.i64 9
  %c-1 = vm.const.i32 -1
  %c7_0 = vm.const.i64 7
  %c2_1 = vm.const.i64 2
  %zero_2 = vm.const.i64.zero
  %null = vm.const.ref.zero : !vm.ref<!hal.fence>
  %c-1_3 = vm.const.i64 -1
  %c2016 = vm.const.i64 2016
  %c2400 = vm.const.i64 2400
  %c5 = vm.const.i64 5
  %c4_4 = vm.const.i64 4
  %c6 = vm.const.i64 6
  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
  %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
  %4 = vm.mul.i64 %0, %c4_4 : i64
  %5 = vm.mul.i64 %4, %1 : i64
  %6 = vm.mul.i64 %5, %2 : i64
  %7 = vm.mul.i64 %6, %3 : i64
  %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
  %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
  vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
  %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
  %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
  %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
  %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
  %8 = vm.trunc.i64.i32 %0 : i64 -> i32
  %9 = vm.shr.i64.u %0, %c32 : i64
  %10 = vm.trunc.i64.i32 %9 : i64 -> i32
  %11 = vm.trunc.i64.i32 %1 : i64 -> i32
  %12 = vm.shr.i64.u %1, %c32 : i64
  %13 = vm.trunc.i64.i32 %12 : i64 -> i32
  %14 = vm.trunc.i64.i32 %2 : i64 -> i32
  %15 = vm.shr.i64.u %2, %c32 : i64
  %16 = vm.trunc.i64.i32 %15 : i64 -> i32
  %17 = vm.trunc.i64.i32 %3 : i64 -> i32
  %18 = vm.shr.i64.u %3, %c32 : i64
  %19 = vm.trunc.i64.i32 %18 : i64 -> i32
  %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
  vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
  vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
  vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
  %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
  vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
  %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
  vm.cond_br %20, ^bb2, ^bb1
^bb1: // pred: ^bb0
  %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8
F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000
0000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64
  %9 = vm.select.i32 %8#0, %nz, %zero : i32
  vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
  %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_3 = vm.cmp.nz.i64 %10#1 : i64
  %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
  vm.br ^bb4(%11 : i32)
^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
  %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
  %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
  %14 = vm.add.i64 %3, %13 : i64
  %15 = vm.and.i32 %12, %eq : i32
  %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
  %16 = vm.add.i64 %2, %c1_1 : i64
  vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
^bb5: // pred: ^bb1
  vm.cond_br %5, ^bb6, ^bb7
^bb6: // pred: ^bb5
  vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
^bb7: // pred: ^bb5
  %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_7 = vm.cmp.nz.i64 %17#1 : i64
  %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
  %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
  %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
  %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
  %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
  vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
  vm.cond_br %eq_10, ^bb8, ^bb9
^bb8: // pred: ^bb7
  %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
  %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...)
-> !vm.ref<!hal.executable>
  vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.return
^bb9: // pred: ^bb7
  vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
}
vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...)
-> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13 = vm.const.i32 13
  %c28 = vm.const.i32 28
  %c4 = vm.const.i32 4
  %c7 = vm.const.i32 7
  %c32 = vm.const.i32 32
  %c48 = vm.const.i32 48
  %c3075 = vm.const.i32 3075
  %c16 = vm.const.i32 16
  %c553648160 = vm.const.i32 553648160
  %c3 = vm.const.i32 3
  %c2 = vm.const.i32 2
  %c1 = vm.const.i32 1
  %zero = vm.const.i32.zero
  %c9 = vm.const.i64 9
  %c-1 = vm.const.i32 -1
  %c7_0 = vm.const.i64 7
  %c2_1 = vm.const.i64 2
  %zero_2 = vm.const.i64.zero
  %null = vm.const.ref.zero : !vm.ref<!hal.fence>
  %c-1_3 = vm.const.i64 -1
  %c2016 = vm.const.i64 2016
  %c2400 = vm.const.i64 2400
  %c5 = vm.const.i64 5
  %c4_4 = vm.const.i64 4
  %c6 = vm.const.i64 6
  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
  %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
  %4 = vm.mul.i64 %0, %c4_4 : i64
  %5 = vm.mul.i64 %4, %1 : i64
  %6 = vm.mul.i64 %5, %2 : i64
  %7 = vm.mul.i64 %6, %3 : i64
  %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
  %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
  vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
  %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
  %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
  %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
  %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
  %8 = vm.trunc.i64.i32 %0 : i64 -> i32
  %9 = vm.shr.i64.u %0, %c32 : i64
  %10 = vm.trunc.i64.i32 %9 : i64 -> i32
  %11 = vm.trunc.i64.i32 %1 : i64 -> i32
  %12 = vm.shr.i64.u %1, %c32 : i64
  %13 = vm.trunc.i64.i32 %12 : i64 -> i32
  %14 = vm.trunc.i64.i32 %2 : i64 -> i32
  %15 = vm.shr.i64.u %2, %c32 : i64
  %16 = vm.trunc.i64.i32 %15 : i64 -> i32
  %17 = vm.trunc.i64.i32 %3 : i64 -> i32
  %18 = vm.shr.i64.u %3, %c32 : i64
  %19 = vm.trunc.i64.i32 %18 : i64 -> i32
  %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
  vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
  vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
  vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
  %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
  vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
  %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
  vm.cond_br %20, ^bb2, ^bb1
^bb1: // pred: ^bb0
  %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::ResolveRodataLoadsPass (iree-vm-resolve-rodata-loads) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB
84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003
0000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump After mlir::iree_compiler::IREE::VM::ResolveRodataLoadsPass (iree-vm-resolve-rodata-loads) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB845
31C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000
0000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump Before Inliner (inline) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989C
A4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000
0010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = 
vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3:  // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
    // Keep the first device that answered both queries; %14 counts matches.
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5:  // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7:  // pred: ^bb5
    // Device found: create the layouts, publish @__device_0, load the executable.
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8:  // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...)
-> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9:  // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
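    // What follows is the synchronous dispatch sequence: resolve the two input
    // buffers, allocate the 2x4x7x9xf32 result (2016 bytes) from the device
    // queue, push the dynamic input shape as eight i32 push constants (lo/hi
    // halves of the four i64 dims), bind the three buffers, dispatch
    // main_dispatch_0 over a 7x4x1 workgroup grid, and block on the signal
    // fence before wrapping the result buffer in a buffer view.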
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = 
vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // 
pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
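    // (Canonicalization is a no-op on @main at this point; this "After" dump
    // repeats the "Before" body above verbatim.)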
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } // -----// IR Dump After Inliner (inline) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC
6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000
000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata 
@_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5:  // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7:  // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8:  // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...)
-> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9:  // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
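// NOTE: the ABI prologue above validates the call arguments: %0..%3 are the four dynamic dims of
// %input0, and %c553648160 is 0x21000020, IREE's element-type encoding for f32. The asserted byte
// lengths work out as %7 = d0*d1*d2*d3*4 for input0, %c2400 = 4*6*5*5*4 for the input1 weights, and
// %c2016 = 2*4*7*9*4 for the conv result (a 5x5 kernel at stride 1 gives 11-5+1 = 7 by 13-5+1 = 9
// output pixels). Below, each i64 dim is split into i32 halves for the push constants:
// lo = vm.trunc.i64.i32 %d and hi = vm.trunc.i64.i32 (vm.shr.i64.u %d, %c32), i.e. lo = d mod 2^32, hi = d / 2^32.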
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before SymbolDCE (symbol-dce) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C46..."> : vector<4136xi8>
// [4136-byte embedded ELF blob elided: byte-identical to @main_dispatch_0_embedded_elf_x86_64 in the preceding dump]
vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9
= vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
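// NOTE: this "Before SymbolDCE" module matches the preceding dump; the full HAL import list is
// still carried along even though the initializer and @main reference only a handful of methods.
// SymbolDCE prunes the unreferenced vm.import symbols, which is why the "After SymbolDCE" dump
// below declares a much shorter import list.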
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
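// NOTE: the sequence below records a one-shot command buffer: push_constants passes the eight i32
// dim halves, push_descriptor_set binds input0 (%ref), input1 (%ref_6), and the queue-allocated
// result buffer (%ref_8) at bindings 0..2, and the dispatch launches main_dispatch_0 on a static
// 7x4x1 workgroup grid (presumably fixed at compile time from tiling the 2x4x7x9 result).
// Execution is fenced: queue.execute waits on the alloca fence %ref_7 and signals %ref_10.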
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After SymbolDCE (symbol-dce) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C46..."> : vector<4136xi8>
// [4136-byte embedded ELF blob elided: byte-identical to @main_dispatch_0_embedded_elf_x86_64 in the earlier dumps]
vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9
= vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
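// NOTE: @main itself is unchanged by SymbolDCE; only unused module-level symbols were dropped. On
// the happy path hal.fence.await returns status 0, so vm.cond_br %20 falls through to ^bb1, which
// wraps the 2016-byte allocation in a 2x4x7x9 f32 buffer view and returns it; any nonzero status
// takes ^bb2 and fails with "failed to wait on timepoint".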
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F003000
04C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000
100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : 
i64
  %9 = vm.select.i32 %8#0, %nz, %zero : i32
  vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
  %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_3 = vm.cmp.nz.i64 %10#1 : i64
  %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
  vm.br ^bb4(%11 : i32)
^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
  %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
  %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
  %14 = vm.add.i64 %3, %13 : i64
  %15 = vm.and.i32 %12, %eq : i32
  %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref
  %16 = vm.add.i64 %2, %c1_1 : i64
  vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref)
^bb5: // pred: ^bb1
  vm.cond_br %5, ^bb6, ^bb7
^bb6: // pred: ^bb5
  vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
^bb7: // pred: ^bb5
  %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_7 = vm.cmp.nz.i64 %17#1 : i64
  %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
  %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref
  %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref
  %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
  %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
  vm.global.store.ref %4, @__device_0 : !vm.ref
  vm.cond_br %eq_10, ^bb8, ^bb9
^bb8: // pred: ^bb7
  %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
  %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref
  vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref
  vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref
  vm.return
^bb9: // pred: ^bb7
  vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
}
vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
-> !vm.ref attributes {nosideeffects}
vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects}
vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32}
vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref)
vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...)
vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...)
vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref
vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...)
vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref
vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield}
vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...)
-> !vm.ref attributes {nosideeffects}
vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13 = vm.const.i32 13
  %c28 = vm.const.i32 28
  %c4 = vm.const.i32 4
  %c7 = vm.const.i32 7
  %c32 = vm.const.i32 32
  %c48 = vm.const.i32 48
  %c3075 = vm.const.i32 3075
  %c16 = vm.const.i32 16
  %c553648160 = vm.const.i32 553648160
  %c3 = vm.const.i32 3
  %c2 = vm.const.i32 2
  %c1 = vm.const.i32 1
  %zero = vm.const.i32.zero
  %c9 = vm.const.i64 9
  %c-1 = vm.const.i32 -1
  %c7_0 = vm.const.i64 7
  %c2_1 = vm.const.i64 2
  %zero_2 = vm.const.i64.zero
  %null = vm.const.ref.zero : !vm.ref
  %c-1_3 = vm.const.i64 -1
  %c2016 = vm.const.i64 2016
  %c2400 = vm.const.i64 2400
  %c5 = vm.const.i64 5
  %c4_4 = vm.const.i64 4
  %c6 = vm.const.i64 6
  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref
  %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref
  %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref
  %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64
  %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64
  %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64
  %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64
  %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
  %4 = vm.mul.i64 %0, %c4_4 : i64
  %5 = vm.mul.i64 %4, %1 : i64
  %6 = vm.mul.i64 %5, %2 : i64
  %7 = vm.mul.i64 %6, %3 : i64
  %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref
  %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref
  %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
  vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
  %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
  %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref
  vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
  %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
  %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref
  %8 = vm.trunc.i64.i32 %0 : i64 -> i32
  %9 = vm.shr.i64.u %0, %c32 : i64
  %10 = vm.trunc.i64.i32 %9 : i64 -> i32
  %11 = vm.trunc.i64.i32 %1 : i64 -> i32
  %12 = vm.shr.i64.u %1, %c32 : i64
  %13 = vm.trunc.i64.i32 %12 : i64 -> i32
  %14 = vm.trunc.i64.i32 %2 : i64 -> i32
  %15 = vm.shr.i64.u %2, %c32 : i64
  %16 = vm.trunc.i64.i32 %15 : i64 -> i32
  %17 = vm.trunc.i64.i32 %3 : i64 -> i32
  %18 = vm.shr.i64.u %3, %c32 : i64
  %19 = vm.trunc.i64.i32 %18 : i64 -> i32
  %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref
  vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...)
  vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...)
  vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> ()
  vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> ()
  %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
  vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...)
  %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32
  vm.cond_br %20, ^bb2, ^bb1
^bb1: // pred: ^bb0
  %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F0030000
4C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001
00200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : 
i64
  %9 = vm.select.i32 %8#0, %nz, %zero : i32
  vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
  %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_3 = vm.cmp.nz.i64 %10#1 : i64
  %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
  vm.br ^bb4(%11 : i32)
^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
  %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
  %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
  %14 = vm.add.i64 %3, %13 : i64
  %15 = vm.and.i32 %12, %eq : i32
  %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref
  %16 = vm.add.i64 %2, %c1_1 : i64
  vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref)
^bb5: // pred: ^bb1
  vm.cond_br %5, ^bb6, ^bb7
^bb6: // pred: ^bb5
  vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
^bb7: // pred: ^bb5
  %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_7 = vm.cmp.nz.i64 %17#1 : i64
  %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
  %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref
  %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref
  %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
  %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
  vm.global.store.ref %4, @__device_0 : !vm.ref
  vm.cond_br %eq_10, ^bb8, ^bb9
^bb8: // pred: ^bb7
  %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
  %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref
  vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref
  vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref
  vm.return
^bb9: // pred: ^bb7
  vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
}
vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
-> !vm.ref attributes {nosideeffects}
vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects}
vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32}
vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref)
vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...)
vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...)
vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref
vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...)
vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref
vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield}
vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...)
-> !vm.ref attributes {nosideeffects}
vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13 = vm.const.i32 13
  %c28 = vm.const.i32 28
  %c4 = vm.const.i32 4
  %c7 = vm.const.i32 7
  %c32 = vm.const.i32 32
  %c48 = vm.const.i32 48
  %c3075 = vm.const.i32 3075
  %c16 = vm.const.i32 16
  %c553648160 = vm.const.i32 553648160
  %c3 = vm.const.i32 3
  %c2 = vm.const.i32 2
  %c1 = vm.const.i32 1
  %zero = vm.const.i32.zero
  %c9 = vm.const.i64 9
  %c-1 = vm.const.i32 -1
  %c7_0 = vm.const.i64 7
  %c2_1 = vm.const.i64 2
  %zero_2 = vm.const.i64.zero
  %null = vm.const.ref.zero : !vm.ref
  %c-1_3 = vm.const.i64 -1
  %c2016 = vm.const.i64 2016
  %c2400 = vm.const.i64 2400
  %c5 = vm.const.i64 5
  %c4_4 = vm.const.i64 4
  %c6 = vm.const.i64 6
  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref
  %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref
  %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref
  %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64
  %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64
  %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64
  %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64
  %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
  %4 = vm.mul.i64 %0, %c4_4 : i64
  %5 = vm.mul.i64 %4, %1 : i64
  %6 = vm.mul.i64 %5, %2 : i64
  %7 = vm.mul.i64 %6, %3 : i64
  %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref
  %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref
  %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
  vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
  %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
  %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref
  vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
  %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
  %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref
  %8 = vm.trunc.i64.i32 %0 : i64 -> i32
  %9 = vm.shr.i64.u %0, %c32 : i64
  %10 = vm.trunc.i64.i32 %9 : i64 -> i32
  %11 = vm.trunc.i64.i32 %1 : i64 -> i32
  %12 = vm.shr.i64.u %1, %c32 : i64
  %13 = vm.trunc.i64.i32 %12 : i64 -> i32
  %14 = vm.trunc.i64.i32 %2 : i64 -> i32
  %15 = vm.shr.i64.u %2, %c32 : i64
  %16 = vm.trunc.i64.i32 %15 : i64 -> i32
  %17 = vm.trunc.i64.i32 %3 : i64 -> i32
  %18 = vm.shr.i64.u %3, %c32 : i64
  %19 = vm.trunc.i64.i32 %18 : i64 -> i32
  %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref
  vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...)
  vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...)
  vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> ()
  vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> ()
  %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
  vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...)
  %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32
  vm.cond_br %20, ^bb2, ^bb1
^bb1: // pred: ^bb0
  %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before CSE (cse) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531
DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100
20000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = 
vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D
69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000
00000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
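// The two hal.buffer_view.assert calls above enforce the reflection ABI:
// element type 553648160 is 0x21000020 (f32 in the HAL element-type
// encoding), %input0 keeps the four dims queried from it via
// hal.buffer_view.dim, and %input1 must be exactly 4x6x5x5. The calls that
// follow fetch the backing buffers, assert minimum byte lengths
// (2400 = 4*6*5*5 * 4 bytes for %input1), and allocate a 2016-byte transient
// result buffer (2*4*7*9 * 4 bytes for the tensor<2x4x7x9xf32> output) on
// the device queue behind a fence.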
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C
8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000
00000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
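// Further down in this function, each dynamic i64 dimension of %input0 is
// split into 32-bit halves (vm.trunc.i64.i32 for the low word, vm.shr.i64.u
// by 32 followed by a trunc for the high word) so that the four dims travel
// to the executable as eight i32 push constants via
// hal.command_buffer.push_constants.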
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8
F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000
0000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C
8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000
00000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::GlobalInitializationPass (iree-vm-global-initialization) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D
3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000
0030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) 
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump After mlir::iree_compiler::IREE::VM::GlobalInitializationPass (iree-vm-global-initialization) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC
248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004
000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
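  // [annotation, not compiler output] The vm.import declarations around this point
  // are the module's call interface into the runtime-provided HAL module and are
  // resolved against it when the VM context is created. A trailing "..." marks a
  // variadic segment of the signature, and {nosideeffects} marks calls the VM may
  // reorder or drop when their results are unused.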
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.br ^bb10
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  ^bb10: // pred: ^bb8
    vm.return
  }
  vm.export @__deinit
  vm.func private @__deinit() {
    vm.return
  }
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module attributes {vm.toplevel} {
  vm.module public @module {
    vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
    vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
    vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
    vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
    vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
    vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
    -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
    vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
    vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
    vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
    vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
    vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
    vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
      %c13 = vm.const.i32 13
      %c28 = vm.const.i32 28
      %c4 = vm.const.i32 4
      %c7 = vm.const.i32 7
      %c32 = vm.const.i32 32
      %c48 = vm.const.i32 48
      %c3075 = vm.const.i32 3075
      %c16 = vm.const.i32 16
      %c553648160 = vm.const.i32 553648160
      %c3 = vm.const.i32 3
      %c2 = vm.const.i32 2
      %c1 = vm.const.i32 1
      %zero = vm.const.i32.zero
      %c9 = vm.const.i64 9
      %c-1 = vm.const.i32 -1
      %c7_0 = vm.const.i64 7
      %c2_1 = vm.const.i64 2
      %zero_2 = vm.const.i64.zero
      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
      %c-1_3 = vm.const.i64 -1
      %c2016 = vm.const.i64 2016
      %c2400 = vm.const.i64 2400
      %c5 = vm.const.i64 5
      %c4_4 = vm.const.i64 4
      %c6 = vm.const.i64 6
      %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
      %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %4 = vm.mul.i64 %0, %c4_4 : i64
      %5 = vm.mul.i64 %4, %1 : i64
      %6 = vm.mul.i64 %5, %2 : i64
      %7 = vm.mul.i64 %6, %3 : i64
      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
      %8 = vm.trunc.i64.i32 %0 : i64 -> i32
      %9 = vm.shr.i64.u %0, %c32 : i64
      %10 = vm.trunc.i64.i32 %9 : i64 -> i32
      %11 = vm.trunc.i64.i32 %1 : i64 -> i32
      %12 = vm.shr.i64.u %1, %c32 : i64
      %13 = vm.trunc.i64.i32 %12 : i64 -> i32
      %14 = vm.trunc.i64.i32 %2 : i64 -> i32
      %15 = vm.shr.i64.u %2, %c32 : i64
      %16 = vm.trunc.i64.i32 %15 : i64 -> i32
      %17 = vm.trunc.i64.i32 %3 : i64 -> i32
      %18 = vm.shr.i64.u %3, %c32 : i64
      %19 = vm.trunc.i64.i32 %18 : i64 -> i32
      %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
      vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
      vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
      vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
      %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
      %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
      vm.cond_br %20, ^bb2, ^bb1
    ^bb1: // pred: ^bb0
      %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
      vm.return %ref_11 : !vm.ref<!hal.buffer_view>
    ^bb2: // pred: ^bb0
      vm.fail %20, "failed to wait on timepoint"
    }
    vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
    vm.export @__init
    vm.func private @__init() {
      %null = vm.const.ref.zero : !vm.buffer
      %c8 = vm.const.i32 8
      %c2 = vm.const.i32 2
      %c3 = vm.const.i32 3
      %c7 = vm.const.i32 7
      %c1 = vm.const.i32 1
      %c14 = vm.const.i32 14
      %c-1 = vm.const.i64 -1
      %c5 = vm.const.i32 5
      %zero = vm.const.i32.zero
      %zero_0 = vm.const.i64.zero
      %c1_1 = vm.const.i64 1
      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
      %5 = vm.xor.i32 %rnz, %c1 : i32
      %slt = vm.cmp.lt.i64.s %2, %1 : i64
      %6 = vm.and.i32 %5, %slt : i32
      vm.cond_br %6, ^bb2, ^bb5
    ^bb2: // pred: ^bb1
      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
      %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
      %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz = vm.cmp.nz.i64 %8#1 : i64
      %9 = vm.select.i32 %8#0, %nz, %zero : i32
      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
    ^bb3: // pred: ^bb2
      %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
      vm.br ^bb4(%11 : i32)
    ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
      %14 = vm.add.i64 %3, %13 : i64
      %15 = vm.and.i32 %12, %eq : i32
      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
      %16 = vm.add.i64 %2, %c1_1 : i64
      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
    ^bb5: // pred: ^bb1
      vm.cond_br %5, ^bb6, ^bb7
    ^bb6: // pred: ^bb5
      vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
    ^bb7: // pred: ^bb5
      %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
      %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
      %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
      %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
      vm.cond_br %eq_10, ^bb8, ^bb9
    ^bb8: // pred: ^bb7
      %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
      %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
      vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      vm.br ^bb10
    ^bb9: // pred: ^bb7
      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    ^bb10: // pred: ^bb8
      vm.return
    }
    vm.export @__deinit
    vm.func private @__deinit() {
      vm.return
    }
  }
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module attributes {vm.toplevel} {
  vm.module public @module {
    vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
    vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
    vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
    vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
    vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
    vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.export @__deinit vm.func private @__deinit() { vm.return } } }
// -----// IR Dump After CSE (cse) //----- //
module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.export @__deinit vm.func private @__deinit() { vm.return } } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
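// note (annotation, not compiler output): the ops that follow compute the dynamic input's byte length as
// d0 * 4 * d1 * d2 * d3 (4 bytes per f32 element) and assert the backing buffers: input1 must span
// 2400 bytes (4*6*5*5*4) and the transient result allocation is 2016 bytes (2*4*7*9*4). Each queried
// i64 dim is then split into lo/hi i32 halves (vm.trunc.i64.i32 plus vm.shr.i64.u by 32) so the four
// dims travel as eight 32-bit push constants, and the dispatch is recorded with workgroup counts
// (7, 4, 1) before the command buffer is finalized, queued, and awaited on a fence.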
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.export @__deinit vm.func private @__deinit() { vm.return } } }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (module body identical to the "IR Dump Before Canonicalizer" listing above; verbatim duplicate elided)
// -----// IR Dump Before mlir::iree_compiler::IREE::VM::DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.export @__deinit
  vm.func private @__deinit() {
    vm.return
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
}
// -----// IR Dump Before DropCompilerHints (iree-util-drop-compiler-hints) //----- //
module attributes {vm.toplevel} {
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
}
}
// -----// IR Dump After DropCompilerHints (iree-util-drop-compiler-hints) //----- //
module attributes {vm.toplevel} {
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
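// %0..%3 above are the dynamic dims of %input0 (queried via hal.buffer_view.dim); the vm.mul.i64 chain below computes the expected byte length %0 * 4 * %1 * %2 * %3 (4 bytes per f32 element) for hal.buffer.assert, and each i64 dim is then split into low/high i32 halves (vm.trunc.i64.i32 plus vm.shr.i64.u by 32) because hal.command_buffer.push_constants only accepts i32 values.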
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::GlobalInitializationPass (iree-vm-global-initialization) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } // -----// IR Dump After mlir::iree_compiler::IREE::VM::GlobalInitializationPass (iree-vm-global-initialization) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
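// in the block below, the 2016-byte transient result (2*4*7*9 f32 elements, 4 bytes each) is allocated with hal.device.queue.alloca signaling fence %ref_7; the dispatch is recorded with a 7x4x1 workgroup count and submitted via hal.device.queue.execute (waiting on %ref_7, signaling %ref_10), and once hal.fence.await on %ref_10 returns zero the raw buffer is wrapped into the returned 2x4x7x9xf32 buffer view; a nonzero wait status branches to vm.fail "failed to wait on timepoint".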
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.br ^bb10
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  ^bb10: // pred: ^bb8
    vm.return
  }
  vm.export @__deinit
  vm.func private @__deinit() {
    vm.return
  }
}
// -----// IR Dump Before mlir::iree_compiler::IREE::VM::DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.br ^bb10
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  ^bb10: // pred: ^bb8
    vm.return
  }
  vm.export @__deinit
  vm.func private @__deinit() {
    vm.return
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.br ^bb10
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  ^bb10: // pred: ^bb8
    vm.return
  }
}
// -----// IR Dump Before Inliner (inline) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
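// Note on the assert above: the expected element type is passed as the i32
// constant %c553648160. Decoding it under IREE's usual HAL element-type
// packing (numerical class in the upper bits, bit width in the lower bits) is
// an assumption here, but it lines up with the declared ABI:
//   553648160 = 0x21000020  =>  numerical type 0x21 (IEEE float), 32 bits  =>  f32
// i.e. %input0 is checked to be a rank-4 f32 buffer view with dims [%0, %1, %2, %3].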
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.br ^bb10 ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" ^bb10: // pred: ^bb8 vm.return } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
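// The multiply chain that follows (%4..%7) derives the input's byte length
// from its dynamic dims: %7 = 4 * d0 * d1 * d2 * d3, the %c4_4 factor being
// sizeof(f32). Worked through for the 2x6x11x13 input this trace was captured
// with: 2*6*11*13 = 1716 elements, 1716*4 = 6864 bytes, which is the minimum
// length the hal.buffer.assert below then enforces.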
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
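// The fixed sizes used from here on are plain byte counts derivable from the
// static shapes: the 4x6x5x5 f32 weights need 600 elements * 4 = 2400 bytes
// (%c2400), and the transient result reserved by hal.device.queue.alloca
// holds the 2x4x7x9 f32 output, 504 elements * 4 = 2016 bytes (%c2016).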
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), 
(%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.br ^bb10 ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" ^bb10: // pred: ^bb8 vm.return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: 
#hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } // -----// IR Dump After Inliner (inline) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
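// Before recording the dispatch, each i64 dim is split into two i32 words,
// since push constants are i32-sized. A minimal sketch of the packing (the
// name d stands in for any of %0..%3 and is illustrative, not from the IR):
//   lo = vm.trunc.i64.i32 d                        // low 32 bits
//   hi = vm.trunc.i64.i32 (vm.shr.i64.u d, 32)     // high 32 bits
// Four dims thus yield the 8 values handed to hal.command_buffer.push_constants,
// and the dispatch is recorded with a 7x4x1 workgroup grid over the 2x4x7x9 output.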
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
}
// -----// IR Dump Before CSE (cse) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } }
// -----// IR Dump After DropCompilerHints (iree-util-drop-compiler-hints) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
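// The four hal.buffer_view.dim calls above recover %input0's dynamic dims, and
// hal.buffer_view.assert validates its element type and encoding
// (%c553648160 = 0x21000020, the packed HAL element type for f32; %c1 is the
// dense row-major encoding type).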
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::OrdinalAllocationPass (iree-vm-ordinal-allocation) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
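// Below, the asserted dims are folded into a byte size (%7 = d0*4*d1*d2*d3,
// i.e. 4 bytes per f32 element) and checked against the backing buffer; the
// static operands get the same check (2400 = 4*6*5*5*4 bytes for %input1,
// 2016 = 2*4*7*9*4 bytes for the result allocation); and each i64 dim is then
// split by trunc/shr into lo/hi i32 halves for
// hal.command_buffer.push_constants.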
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } // -----// IR Dump After mlir::iree_compiler::IREE::VM::OrdinalAllocationPass (iree-vm-ordinal-allocation) //----- // vm.module public @module attributes {ordinal_counts = #vm.ordinal_counts} { vm.global.ref private mutable @__device_0 {ordinal = 0 : i32} : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 {ordinal = 1 : i32} : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 {ordinal = 2 : i32} : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64, ordinal = 0 : i32} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64, ordinal = 1 : i32} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64, ordinal = 2 : i32} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64, ordinal = 3 : i32} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf", ordinal = 4 : i32} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {ordinal = 0 : i32} vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects, ordinal = 1 : i32} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {ordinal = 2 : i32} vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects, ordinal = 3 : i32} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects, ordinal = 4 : i32} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32, ordinal = 5 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) attributes {ordinal = 6 : i32} vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {ordinal = 7 : i32} vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) attributes {ordinal = 8 : i32} vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) attributes {ordinal = 9 : i32} vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) attributes {ordinal = 10 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects, ordinal = 11 : i32} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects, ordinal = 12 : i32} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects, ordinal = 13 : i32} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref attributes {ordinal = 14 : i32} vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) attributes {ordinal = 15 : i32} vm.import private @hal.devices.count() -> i32 attributes {nosideeffects, ordinal = 16 : i32} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects, ordinal = 17 : i32} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects, ordinal = 18 : i32} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref attributes {ordinal = 19 : i32} vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {ordinal = 20 : i32, vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects, ordinal = 21 : i32} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64, ordinal = 5 : i32} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64, ordinal = 6 : i32} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64, ordinal = 7 : i32} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}, ordinal = 0 : i32} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, 
%_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}, ordinal = 0 : i32} vm.export @__init attributes {ordinal = 1 : i32} vm.func private @__init() attributes {ordinal = 1 : i32} { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, 
!vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } }
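// The before/after pair above shows that iree-vm-ordinal-allocation rewrites
// no logic: it only assigns each import, export, internal function, global,
// and rodata a dense per-kind ordinal and records the per-kind totals in the
// module's #vm.ordinal_counts attribute so the bytecode serializer can emit
// fixed-size lookup tables. A minimal sketch of the same transformation on a
// toy module (@m, @blob, and @f are illustrative names, not symbols from this
// dump; the #vm.ordinal_counts payload is elided):
//
//   vm.module public @m {
//     vm.rodata private @blob {alignment = 1 : i64} "abc"
//     vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
//     vm.func private @f() { vm.return }
//     vm.export @f
//   }
//
// becomes
//
//   vm.module public @m attributes {ordinal_counts = #vm.ordinal_counts<...>} {
//     vm.rodata private @blob {alignment = 1 : i64, ordinal = 0 : i32} "abc"
//     vm.import private @hal.devices.count() -> i32
//         attributes {nosideeffects, ordinal = 0 : i32}
//     vm.func private @f() attributes {ordinal = 0 : i32} { vm.return }
//     vm.export @f attributes {ordinal = 0 : i32}
//   }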